| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE2 |
| ; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefix=CHECK-SVE2-I8MM |
| ; RUN: llc -mtriple=aarch64 -mattr=+sve2,+sme,+i8mm -force-streaming %s -o - | FileCheck %s --check-prefix=CHECK-SME |
| |
| ; Unsigned i8 -> i32 case: both operands are zero-extended, multiplied, and |
| ; partially reduced from <vscale x 16 x i32> into the <vscale x 4 x i32> |
| ; accumulator. All three RUN configurations expect a single udot. |
| define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { |
| ; CHECK-SVE2-LABEL: udot: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: udot: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: udot: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: udot z0.s, z1.b, z2.b |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32> |
| %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32> |
| %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |
| |
| ; Unsigned i16 -> i64 case: zero-extended operands, multiply, then partial |
| ; reduction from <vscale x 8 x i64> into <vscale x 2 x i64>. All three RUN |
| ; configurations expect a single udot on .h sources with a .d accumulator. |
| define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { |
| ; CHECK-SVE2-LABEL: udot_wide: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: udot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: udot_wide: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: udot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: udot_wide: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: udot z0.d, z1.h, z2.h |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64> |
| %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64> |
| %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Signed i8 -> i32 case: both operands are sign-extended, multiplied, and |
| ; partially reduced from <vscale x 16 x i32> into the <vscale x 4 x i32> |
| ; accumulator. All three RUN configurations expect a single sdot. |
| define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { |
| ; CHECK-SVE2-LABEL: sdot: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: sdot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sdot: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: sdot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sdot: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: sdot z0.s, z1.b, z2.b |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> |
| %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32> |
| %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |
| |
| ; Signed i16 -> i64 case: sign-extended operands, multiply, then partial |
| ; reduction from <vscale x 8 x i64> into <vscale x 2 x i64>. All three RUN |
| ; configurations expect a single sdot on .h sources with a .d accumulator. |
| define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { |
| ; CHECK-SVE2-LABEL: sdot_wide: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: sdot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sdot_wide: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: sdot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sdot_wide: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: sdot z0.d, z1.h, z2.h |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64> |
| %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64> |
| %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Mixed-sign i8 -> i32 case: %a is zero-extended and %b is sign-extended. |
| ; The plain +sve2 run expects the expanded unpack + mla sequence; the runs |
| ; that add +i8mm (including the streaming SME run) expect a single usdot. |
| define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { |
| ; CHECK-SVE2-LABEL: usdot: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: uunpklo z3.h, z1.b |
| ; CHECK-SVE2-NEXT: sunpklo z4.h, z2.b |
| ; CHECK-SVE2-NEXT: ptrue p0.s |
| ; CHECK-SVE2-NEXT: uunpkhi z1.h, z1.b |
| ; CHECK-SVE2-NEXT: sunpkhi z2.h, z2.b |
| ; CHECK-SVE2-NEXT: uunpklo z5.s, z3.h |
| ; CHECK-SVE2-NEXT: sunpklo z6.s, z4.h |
| ; CHECK-SVE2-NEXT: uunpkhi z3.s, z3.h |
| ; CHECK-SVE2-NEXT: sunpkhi z4.s, z4.h |
| ; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s |
| ; CHECK-SVE2-NEXT: uunpklo z5.s, z1.h |
| ; CHECK-SVE2-NEXT: sunpklo z6.s, z2.h |
| ; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: mla z0.s, p0/m, z3.s, z4.s |
| ; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s |
| ; CHECK-SVE2-NEXT: mla z0.s, p0/m, z1.s, z2.s |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: usdot: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: usdot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: usdot: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: usdot z0.s, z1.b, z2.b |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32> |
| %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32> |
| %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |
| |
| ; Mixed-sign i8 -> i32 case with the signs swapped relative to @usdot: %a is |
| ; sign-extended and %b is zero-extended. The plain +sve2 run expects the |
| ; expanded unpack + mla sequence; the +i8mm runs expect a usdot with its |
| ; source operands exchanged (z2 before z1). |
| define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { |
| ; CHECK-SVE2-LABEL: sudot: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: sunpklo z3.h, z1.b |
| ; CHECK-SVE2-NEXT: uunpklo z4.h, z2.b |
| ; CHECK-SVE2-NEXT: ptrue p0.s |
| ; CHECK-SVE2-NEXT: sunpkhi z1.h, z1.b |
| ; CHECK-SVE2-NEXT: uunpkhi z2.h, z2.b |
| ; CHECK-SVE2-NEXT: sunpklo z5.s, z3.h |
| ; CHECK-SVE2-NEXT: uunpklo z6.s, z4.h |
| ; CHECK-SVE2-NEXT: sunpkhi z3.s, z3.h |
| ; CHECK-SVE2-NEXT: uunpkhi z4.s, z4.h |
| ; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s |
| ; CHECK-SVE2-NEXT: sunpklo z5.s, z1.h |
| ; CHECK-SVE2-NEXT: uunpklo z6.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: mla z0.s, p0/m, z3.s, z4.s |
| ; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s |
| ; CHECK-SVE2-NEXT: mla z0.s, p0/m, z1.s, z2.s |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sudot: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: usdot z0.s, z2.b, z1.b |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sudot: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: usdot z0.s, z2.b, z1.b |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> |
| %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32> |
| %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |
| |
| ; Unsigned i8 -> i64 case with a <vscale x 4 x i64> accumulator. The expected |
| ; output in every configuration is a udot into a zeroed .s register (z4) |
| ; followed by uaddwb/uaddwt widening adds into z0.d; only the instruction |
| ; used to zero z4 differs in the streaming SME run. |
| define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { |
| ; CHECK-SVE2-LABEL: udot_8to64: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-SVE2-NEXT: udot z4.s, z2.b, z3.b |
| ; CHECK-SVE2-NEXT: uaddwb z0.d, z0.d, z4.s |
| ; CHECK-SVE2-NEXT: uaddwt z0.d, z0.d, z4.s |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: udot_8to64: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-SVE2-I8MM-NEXT: udot z4.s, z2.b, z3.b |
| ; CHECK-SVE2-I8MM-NEXT: uaddwb z0.d, z0.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: uaddwt z0.d, z0.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: udot_8to64: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 |
| ; CHECK-SME-NEXT: udot z4.s, z2.b, z3.b |
| ; CHECK-SME-NEXT: uaddwb z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: uaddwt z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64> |
| %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64> |
| %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( |
| <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult) |
| ret <vscale x 4 x i64> %partial.reduce |
| } |
| |
| ; Signed i8 -> i64 case with a <vscale x 4 x i64> accumulator. The expected |
| ; output in every configuration is an sdot into a zeroed .s register (z4) |
| ; followed by saddwb/saddwt widening adds into z0.d; only the instruction |
| ; used to zero z4 differs in the streaming SME run. |
| define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b){ |
| ; CHECK-SVE2-LABEL: sdot_8to64: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-SVE2-NEXT: sdot z4.s, z2.b, z3.b |
| ; CHECK-SVE2-NEXT: saddwb z0.d, z0.d, z4.s |
| ; CHECK-SVE2-NEXT: saddwt z0.d, z0.d, z4.s |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sdot_8to64: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-SVE2-I8MM-NEXT: sdot z4.s, z2.b, z3.b |
| ; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sdot_8to64: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 |
| ; CHECK-SME-NEXT: sdot z4.s, z2.b, z3.b |
| ; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64> |
| %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64> |
| %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( |
| <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult) |
| ret <vscale x 4 x i64> %partial.reduce |
| } |
| |
| ; Mixed-sign i8 -> i64 case: %a is zero-extended, %b is sign-extended, and the |
| ; accumulator is <vscale x 4 x i64>. The plain +sve2 run expects the fully |
| ; expanded unpack + mla sequence writing both z0 and z1; the +i8mm runs expect |
| ; a usdot into a zeroed .s register followed by saddwb/saddwt into z0.d. |
| define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b){ |
| ; CHECK-SVE2-LABEL: usdot_8to64: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: uunpkhi z4.h, z2.b |
| ; CHECK-SVE2-NEXT: uunpklo z2.h, z2.b |
| ; CHECK-SVE2-NEXT: sunpkhi z5.h, z3.b |
| ; CHECK-SVE2-NEXT: sunpklo z3.h, z3.b |
| ; CHECK-SVE2-NEXT: ptrue p0.d |
| ; CHECK-SVE2-NEXT: uunpklo z6.s, z4.h |
| ; CHECK-SVE2-NEXT: uunpklo z7.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpklo z24.s, z5.h |
| ; CHECK-SVE2-NEXT: sunpklo z25.s, z3.h |
| ; CHECK-SVE2-NEXT: uunpkhi z4.s, z4.h |
| ; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpkhi z5.s, z5.h |
| ; CHECK-SVE2-NEXT: sunpkhi z3.s, z3.h |
| ; CHECK-SVE2-NEXT: uunpklo z26.d, z6.s |
| ; CHECK-SVE2-NEXT: uunpklo z27.d, z7.s |
| ; CHECK-SVE2-NEXT: sunpklo z28.d, z24.s |
| ; CHECK-SVE2-NEXT: sunpklo z29.d, z25.s |
| ; CHECK-SVE2-NEXT: uunpkhi z6.d, z6.s |
| ; CHECK-SVE2-NEXT: uunpkhi z7.d, z7.s |
| ; CHECK-SVE2-NEXT: sunpkhi z24.d, z24.s |
| ; CHECK-SVE2-NEXT: sunpkhi z25.d, z25.s |
| ; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d |
| ; CHECK-SVE2-NEXT: uunpklo z26.d, z4.s |
| ; CHECK-SVE2-NEXT: sunpklo z28.d, z5.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d |
| ; CHECK-SVE2-NEXT: uunpklo z27.d, z2.s |
| ; CHECK-SVE2-NEXT: sunpklo z29.d, z3.s |
| ; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s |
| ; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s |
| ; CHECK-SVE2-NEXT: sunpkhi z5.d, z5.s |
| ; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s |
| ; CHECK-SVE2-NEXT: mla z1.d, p0/m, z6.d, z24.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z7.d, z25.d |
| ; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d |
| ; CHECK-SVE2-NEXT: mla z1.d, p0/m, z4.d, z5.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z2.d, z3.d |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: usdot_8to64: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-SVE2-I8MM-NEXT: usdot z4.s, z2.b, z3.b |
| ; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: usdot_8to64: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 |
| ; CHECK-SME-NEXT: usdot z4.s, z2.b, z3.b |
| ; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64> |
| %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64> |
| %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( |
| <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult) |
| ret <vscale x 4 x i64> %partial.reduce |
| } |
| |
| ; Mixed-sign i8 -> i64 case with the signs swapped relative to @usdot_8to64: |
| ; %a is sign-extended and %b is zero-extended. The plain +sve2 run expects the |
| ; fully expanded unpack + mla sequence; the +i8mm runs expect a usdot with its |
| ; source operands exchanged (z3 before z2) followed by saddwb/saddwt. |
| define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { |
| ; CHECK-SVE2-LABEL: sudot_8to64: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: sunpkhi z4.h, z2.b |
| ; CHECK-SVE2-NEXT: sunpklo z2.h, z2.b |
| ; CHECK-SVE2-NEXT: uunpkhi z5.h, z3.b |
| ; CHECK-SVE2-NEXT: uunpklo z3.h, z3.b |
| ; CHECK-SVE2-NEXT: ptrue p0.d |
| ; CHECK-SVE2-NEXT: sunpklo z6.s, z4.h |
| ; CHECK-SVE2-NEXT: sunpklo z7.s, z2.h |
| ; CHECK-SVE2-NEXT: uunpklo z24.s, z5.h |
| ; CHECK-SVE2-NEXT: uunpklo z25.s, z3.h |
| ; CHECK-SVE2-NEXT: sunpkhi z4.s, z4.h |
| ; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: uunpkhi z5.s, z5.h |
| ; CHECK-SVE2-NEXT: uunpkhi z3.s, z3.h |
| ; CHECK-SVE2-NEXT: sunpklo z26.d, z6.s |
| ; CHECK-SVE2-NEXT: sunpklo z27.d, z7.s |
| ; CHECK-SVE2-NEXT: uunpklo z28.d, z24.s |
| ; CHECK-SVE2-NEXT: uunpklo z29.d, z25.s |
| ; CHECK-SVE2-NEXT: sunpkhi z6.d, z6.s |
| ; CHECK-SVE2-NEXT: sunpkhi z7.d, z7.s |
| ; CHECK-SVE2-NEXT: uunpkhi z24.d, z24.s |
| ; CHECK-SVE2-NEXT: uunpkhi z25.d, z25.s |
| ; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d |
| ; CHECK-SVE2-NEXT: sunpklo z26.d, z4.s |
| ; CHECK-SVE2-NEXT: uunpklo z28.d, z5.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d |
| ; CHECK-SVE2-NEXT: sunpklo z27.d, z2.s |
| ; CHECK-SVE2-NEXT: uunpklo z29.d, z3.s |
| ; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s |
| ; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s |
| ; CHECK-SVE2-NEXT: uunpkhi z5.d, z5.s |
| ; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s |
| ; CHECK-SVE2-NEXT: mla z1.d, p0/m, z6.d, z24.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z7.d, z25.d |
| ; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d |
| ; CHECK-SVE2-NEXT: mla z1.d, p0/m, z4.d, z5.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z2.d, z3.d |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sudot_8to64: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-SVE2-I8MM-NEXT: usdot z4.s, z3.b, z2.b |
| ; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sudot_8to64: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 |
| ; CHECK-SME-NEXT: usdot z4.s, z3.b, z2.b |
| ; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64> |
| %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64> |
| %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( |
| <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult) |
| ret <vscale x 4 x i64> %partial.reduce |
| } |
| |
| ; Unsigned i8 -> i32 case with no multiply: the zero-extended input feeds the |
| ; partial reduction directly. Every configuration expects a udot against a |
| ; register holding the byte constant 1 (mov z2.b, #1). |
| define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){ |
| ; CHECK-SVE2-LABEL: udot_no_bin_op: |
| ; CHECK-SVE2: // %bb.0: |
| ; CHECK-SVE2-NEXT: mov z2.b, #1 // =0x1 |
| ; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op: |
| ; CHECK-SVE2-I8MM: // %bb.0: |
| ; CHECK-SVE2-I8MM-NEXT: mov z2.b, #1 // =0x1 |
| ; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: udot_no_bin_op: |
| ; CHECK-SME: // %bb.0: |
| ; CHECK-SME-NEXT: mov z2.b, #1 // =0x1 |
| ; CHECK-SME-NEXT: udot z0.s, z1.b, z2.b |
| ; CHECK-SME-NEXT: ret |
| %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32> |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |
| |
| ; Signed i8 -> i32 case with no multiply: the sign-extended input feeds the |
| ; partial reduction directly. Every configuration expects an sdot against a |
| ; register holding the byte constant 1 (mov z2.b, #1). |
| define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){ |
| ; CHECK-SVE2-LABEL: sdot_no_bin_op: |
| ; CHECK-SVE2: // %bb.0: |
| ; CHECK-SVE2-NEXT: mov z2.b, #1 // =0x1 |
| ; CHECK-SVE2-NEXT: sdot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op: |
| ; CHECK-SVE2-I8MM: // %bb.0: |
| ; CHECK-SVE2-I8MM-NEXT: mov z2.b, #1 // =0x1 |
| ; CHECK-SVE2-I8MM-NEXT: sdot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sdot_no_bin_op: |
| ; CHECK-SME: // %bb.0: |
| ; CHECK-SME-NEXT: mov z2.b, #1 // =0x1 |
| ; CHECK-SME-NEXT: sdot z0.s, z1.b, z2.b |
| ; CHECK-SME-NEXT: ret |
| %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |
| |
| ; Unsigned i16 -> i64 case with no multiply: the zero-extended %a feeds the |
| ; partial reduction directly. Every configuration expects a udot against a |
| ; register holding the halfword constant 1 (mov z2.h, #1). |
| ; NOTE(review): %b is declared but never referenced in the body. Removing it |
| ; would require regenerating the assertions, so it is left in place. |
| define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){ |
| ; CHECK-SVE2-LABEL: udot_no_bin_op_wide: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: mov z2.h, #1 // =0x1 |
| ; CHECK-SVE2-NEXT: udot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_wide: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: mov z2.h, #1 // =0x1 |
| ; CHECK-SVE2-I8MM-NEXT: udot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: udot_no_bin_op_wide: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: mov z2.h, #1 // =0x1 |
| ; CHECK-SME-NEXT: udot z0.d, z1.h, z2.h |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64> |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Signed i16 -> i64 case with no multiply: the sign-extended %a feeds the |
| ; partial reduction directly. Every configuration expects an sdot against a |
| ; register holding the halfword constant 1 (mov z2.h, #1). |
| ; NOTE(review): %b is declared but never referenced in the body. Removing it |
| ; would require regenerating the assertions, so it is left in place. |
| define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){ |
| ; CHECK-SVE2-LABEL: sdot_no_bin_op_wide: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: mov z2.h, #1 // =0x1 |
| ; CHECK-SVE2-NEXT: sdot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_wide: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: mov z2.h, #1 // =0x1 |
| ; CHECK-SVE2-I8MM-NEXT: sdot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sdot_no_bin_op_wide: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: mov z2.h, #1 // =0x1 |
| ; CHECK-SME-NEXT: sdot z0.d, z1.h, z2.h |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64> |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Unsigned i8 -> i64 case with no multiply and a <vscale x 4 x i64> |
| ; accumulator. Every configuration expects a udot of the input against a |
| ; byte-1 register into a zeroed .s register, then uaddwb/uaddwt into z0.d. |
| ; The streaming SME run uses different scratch registers and zeroing. |
| define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){ |
| ; CHECK-SVE2-LABEL: udot_no_bin_op_8to64: |
| ; CHECK-SVE2: // %bb.0: |
| ; CHECK-SVE2-NEXT: movi v3.2d, #0000000000000000 |
| ; CHECK-SVE2-NEXT: mov z4.b, #1 // =0x1 |
| ; CHECK-SVE2-NEXT: udot z3.s, z2.b, z4.b |
| ; CHECK-SVE2-NEXT: uaddwb z0.d, z0.d, z3.s |
| ; CHECK-SVE2-NEXT: uaddwt z0.d, z0.d, z3.s |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_8to64: |
| ; CHECK-SVE2-I8MM: // %bb.0: |
| ; CHECK-SVE2-I8MM-NEXT: movi v3.2d, #0000000000000000 |
| ; CHECK-SVE2-I8MM-NEXT: mov z4.b, #1 // =0x1 |
| ; CHECK-SVE2-I8MM-NEXT: udot z3.s, z2.b, z4.b |
| ; CHECK-SVE2-I8MM-NEXT: uaddwb z0.d, z0.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: uaddwt z0.d, z0.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: udot_no_bin_op_8to64: |
| ; CHECK-SME: // %bb.0: |
| ; CHECK-SME-NEXT: mov z3.b, #1 // =0x1 |
| ; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 |
| ; CHECK-SME-NEXT: udot z4.s, z2.b, z3.b |
| ; CHECK-SME-NEXT: uaddwb z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: uaddwt z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: ret |
| %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64> |
| %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext) |
| ret <vscale x 4 x i64> %partial.reduce |
| } |
| |
| ; Signed i8 -> i64 case with no multiply and a <vscale x 4 x i64> |
| ; accumulator. Every configuration expects an sdot of the input against a |
| ; byte-1 register into a zeroed .s register, then saddwb/saddwt into z0.d. |
| ; The streaming SME run uses different scratch registers and zeroing. |
| define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){ |
| ; CHECK-SVE2-LABEL: sdot_no_bin_op_8to64: |
| ; CHECK-SVE2: // %bb.0: |
| ; CHECK-SVE2-NEXT: movi v3.2d, #0000000000000000 |
| ; CHECK-SVE2-NEXT: mov z4.b, #1 // =0x1 |
| ; CHECK-SVE2-NEXT: sdot z3.s, z2.b, z4.b |
| ; CHECK-SVE2-NEXT: saddwb z0.d, z0.d, z3.s |
| ; CHECK-SVE2-NEXT: saddwt z0.d, z0.d, z3.s |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_8to64: |
| ; CHECK-SVE2-I8MM: // %bb.0: |
| ; CHECK-SVE2-I8MM-NEXT: movi v3.2d, #0000000000000000 |
| ; CHECK-SVE2-I8MM-NEXT: mov z4.b, #1 // =0x1 |
| ; CHECK-SVE2-I8MM-NEXT: sdot z3.s, z2.b, z4.b |
| ; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sdot_no_bin_op_8to64: |
| ; CHECK-SME: // %bb.0: |
| ; CHECK-SME-NEXT: mov z3.b, #1 // =0x1 |
| ; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 |
| ; CHECK-SME-NEXT: sdot z4.s, z2.b, z3.b |
| ; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s |
| ; CHECK-SME-NEXT: ret |
| %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64> |
| %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext) |
| ret <vscale x 4 x i64> %partial.reduce |
| } |
| |
| ; Negative test: the inputs are <vscale x 8 x i8>, so the partial reduction |
| ; goes from <vscale x 8 x i32> to <vscale x 4 x i32>. No configuration is |
| ; expected to emit udot; the expected output is and-masking of both operands |
| ; followed by umlalb/umlalt. |
| ; The intrinsic suffix names the <vscale x 8 x i32> operand type (nxv8i32). |
| define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { |
| ; CHECK-SVE2-LABEL: not_udot: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SVE2-NEXT: and z1.h, z1.h, #0xff |
| ; CHECK-SVE2-NEXT: umlalb z0.s, z1.h, z2.h |
| ; CHECK-SVE2-NEXT: umlalt z0.s, z1.h, z2.h |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: not_udot: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SVE2-I8MM-NEXT: and z1.h, z1.h, #0xff |
| ; CHECK-SVE2-I8MM-NEXT: umlalb z0.s, z1.h, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: umlalt z0.s, z1.h, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: not_udot: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SME-NEXT: and z1.h, z1.h, #0xff |
| ; CHECK-SME-NEXT: umlalb z0.s, z1.h, z2.h |
| ; CHECK-SME-NEXT: umlalt z0.s, z1.h, z2.h |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> |
| %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> |
| %mult = mul nuw nsw <vscale x 8 x i32> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %mult) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |
| |
| ; Negative test: the inputs are <vscale x 4 x i16>, so the partial reduction |
| ; goes from <vscale x 4 x i64> to <vscale x 2 x i64>. No configuration is |
| ; expected to emit udot; the expected output is and-masking of both operands |
| ; followed by umlalb/umlalt. |
| define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) { |
| ; CHECK-SVE2-LABEL: not_udot_wide: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: and z2.s, z2.s, #0xffff |
| ; CHECK-SVE2-NEXT: and z1.s, z1.s, #0xffff |
| ; CHECK-SVE2-NEXT: umlalb z0.d, z1.s, z2.s |
| ; CHECK-SVE2-NEXT: umlalt z0.d, z1.s, z2.s |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: not_udot_wide: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: and z2.s, z2.s, #0xffff |
| ; CHECK-SVE2-I8MM-NEXT: and z1.s, z1.s, #0xffff |
| ; CHECK-SVE2-I8MM-NEXT: umlalb z0.d, z1.s, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: umlalt z0.d, z1.s, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: not_udot_wide: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: and z2.s, z2.s, #0xffff |
| ; CHECK-SME-NEXT: and z1.s, z1.s, #0xffff |
| ; CHECK-SME-NEXT: umlalb z0.d, z1.s, z2.s |
| ; CHECK-SME-NEXT: umlalt z0.d, z1.s, z2.s |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64> |
| %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64> |
| %mult = mul nuw nsw <vscale x 4 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %mult) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Negative test: mixed-sign i16 -> i64 (%a zero-extended, %b sign-extended). |
| ; No configuration, including the +i8mm ones, is expected to emit usdot; all |
| ; three expect the same expanded unpack + mla sequence. |
| define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { |
| ; CHECK-SVE2-LABEL: not_usdot: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: uunpklo z3.s, z1.h |
| ; CHECK-SVE2-NEXT: sunpklo z4.s, z2.h |
| ; CHECK-SVE2-NEXT: ptrue p0.d |
| ; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: uunpklo z5.d, z3.s |
| ; CHECK-SVE2-NEXT: sunpklo z6.d, z4.s |
| ; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s |
| ; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: uunpklo z5.d, z1.s |
| ; CHECK-SVE2-NEXT: sunpklo z6.d, z2.s |
| ; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s |
| ; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: not_usdot: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ptrue p0.d |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z3.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z4.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: not_usdot: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: uunpklo z3.s, z1.h |
| ; CHECK-SME-NEXT: sunpklo z4.s, z2.h |
| ; CHECK-SME-NEXT: ptrue p0.d |
| ; CHECK-SME-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SME-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SME-NEXT: uunpklo z5.d, z3.s |
| ; CHECK-SME-NEXT: sunpklo z6.d, z4.s |
| ; CHECK-SME-NEXT: uunpkhi z3.d, z3.s |
| ; CHECK-SME-NEXT: sunpkhi z4.d, z4.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: uunpklo z5.d, z1.s |
| ; CHECK-SME-NEXT: sunpklo z6.d, z2.s |
| ; CHECK-SME-NEXT: uunpkhi z1.d, z1.s |
| ; CHECK-SME-NEXT: sunpkhi z2.d, z2.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64> |
| %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64> |
| %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Negative test: mixed-sign i16 -> i64 with the signs swapped relative to |
| ; @not_usdot (%a sign-extended, %b zero-extended). No configuration is |
| ; expected to emit usdot; all three expect the same expanded unpack + mla |
| ; sequence. |
| define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { |
| ; CHECK-SVE2-LABEL: not_sudot: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: sunpklo z3.s, z1.h |
| ; CHECK-SVE2-NEXT: uunpklo z4.s, z2.h |
| ; CHECK-SVE2-NEXT: ptrue p0.d |
| ; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpklo z5.d, z3.s |
| ; CHECK-SVE2-NEXT: uunpklo z6.d, z4.s |
| ; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s |
| ; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: sunpklo z5.d, z1.s |
| ; CHECK-SVE2-NEXT: uunpklo z6.d, z2.s |
| ; CHECK-SVE2-NEXT: sunpkhi z1.d, z1.s |
| ; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: not_sudot: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ptrue p0.d |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z3.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z4.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: not_sudot: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: sunpklo z3.s, z1.h |
| ; CHECK-SME-NEXT: uunpklo z4.s, z2.h |
| ; CHECK-SME-NEXT: ptrue p0.d |
| ; CHECK-SME-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SME-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SME-NEXT: sunpklo z5.d, z3.s |
| ; CHECK-SME-NEXT: uunpklo z6.d, z4.s |
| ; CHECK-SME-NEXT: sunpkhi z3.d, z3.s |
| ; CHECK-SME-NEXT: uunpkhi z4.d, z4.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: sunpklo z5.d, z1.s |
| ; CHECK-SME-NEXT: uunpklo z6.d, z2.s |
| ; CHECK-SME-NEXT: sunpkhi z1.d, z1.s |
| ; CHECK-SME-NEXT: uunpkhi z2.d, z2.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64> |
| %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64> |
| %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Negative test: both operands are zero-extended to i64, but %a is i16 while |
| ; %b is i8. The expected output for every RUN line contains no udot; %b is |
| ; masked with #0xff and the product is accumulated via unpacks and mla. |
| define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){ |
| ; CHECK-SVE2-LABEL: udot_different_types: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SVE2-NEXT: uunpklo z3.s, z1.h |
| ; CHECK-SVE2-NEXT: ptrue p0.d |
| ; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SVE2-NEXT: uunpklo z4.s, z2.h |
| ; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: uunpklo z5.d, z3.s |
| ; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s |
| ; CHECK-SVE2-NEXT: uunpklo z6.d, z4.s |
| ; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: uunpklo z5.d, z1.s |
| ; CHECK-SVE2-NEXT: uunpklo z6.d, z2.s |
| ; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s |
| ; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: udot_different_types: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: ptrue p0.d |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z3.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z4.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: udot_different_types: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SME-NEXT: uunpklo z3.s, z1.h |
| ; CHECK-SME-NEXT: ptrue p0.d |
| ; CHECK-SME-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SME-NEXT: uunpklo z4.s, z2.h |
| ; CHECK-SME-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SME-NEXT: uunpklo z5.d, z3.s |
| ; CHECK-SME-NEXT: uunpkhi z3.d, z3.s |
| ; CHECK-SME-NEXT: uunpklo z6.d, z4.s |
| ; CHECK-SME-NEXT: uunpkhi z4.d, z4.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: uunpklo z5.d, z1.s |
| ; CHECK-SME-NEXT: uunpklo z6.d, z2.s |
| ; CHECK-SME-NEXT: uunpkhi z1.d, z1.s |
| ; CHECK-SME-NEXT: uunpkhi z2.d, z2.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64> |
| %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64> |
| %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Negative test: both operands are sign-extended to i64, but %a is i16 while |
| ; %b is i8. The expected output for every RUN line contains no sdot; %b is |
| ; sign-extended in place with sxtb and the product is accumulated via signed |
| ; unpacks and mla. |
| define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){ |
| ; CHECK-SVE2-LABEL: sdot_different_types: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: ptrue p0.h |
| ; CHECK-SVE2-NEXT: sunpklo z3.s, z1.h |
| ; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SVE2-NEXT: sxtb z2.h, p0/m, z2.h |
| ; CHECK-SVE2-NEXT: ptrue p0.d |
| ; CHECK-SVE2-NEXT: sunpklo z5.d, z3.s |
| ; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s |
| ; CHECK-SVE2-NEXT: sunpklo z4.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpklo z6.d, z4.s |
| ; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: sunpklo z5.d, z1.s |
| ; CHECK-SVE2-NEXT: sunpklo z6.d, z2.s |
| ; CHECK-SVE2-NEXT: sunpkhi z1.d, z1.s |
| ; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sdot_different_types: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: ptrue p0.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: sxtb z2.h, p0/m, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ptrue p0.d |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z3.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z4.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sdot_different_types: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: ptrue p0.h |
| ; CHECK-SME-NEXT: sunpklo z3.s, z1.h |
| ; CHECK-SME-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SME-NEXT: sxtb z2.h, p0/m, z2.h |
| ; CHECK-SME-NEXT: ptrue p0.d |
| ; CHECK-SME-NEXT: sunpklo z5.d, z3.s |
| ; CHECK-SME-NEXT: sunpkhi z3.d, z3.s |
| ; CHECK-SME-NEXT: sunpklo z4.s, z2.h |
| ; CHECK-SME-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SME-NEXT: sunpklo z6.d, z4.s |
| ; CHECK-SME-NEXT: sunpkhi z4.d, z4.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: sunpklo z5.d, z1.s |
| ; CHECK-SME-NEXT: sunpklo z6.d, z2.s |
| ; CHECK-SME-NEXT: sunpkhi z1.d, z1.s |
| ; CHECK-SME-NEXT: sunpkhi z2.d, z2.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64> |
| %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64> |
| %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Negative test: %a (i16) is zero-extended and %b (i8) is sign-extended to |
| ; i64. The expected output contains no usdot for any RUN line, including the |
| ; +i8mm ones; the product is accumulated via unpacks and mla. |
| define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){ |
| ; CHECK-SVE2-LABEL: usdot_different_types: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: ptrue p0.h |
| ; CHECK-SVE2-NEXT: uunpklo z3.s, z1.h |
| ; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SVE2-NEXT: sxtb z2.h, p0/m, z2.h |
| ; CHECK-SVE2-NEXT: ptrue p0.d |
| ; CHECK-SVE2-NEXT: uunpklo z5.d, z3.s |
| ; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s |
| ; CHECK-SVE2-NEXT: sunpklo z4.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpklo z6.d, z4.s |
| ; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: uunpklo z5.d, z1.s |
| ; CHECK-SVE2-NEXT: sunpklo z6.d, z2.s |
| ; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s |
| ; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: usdot_different_types: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: ptrue p0.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: sxtb z2.h, p0/m, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ptrue p0.d |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z3.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z4.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: usdot_different_types: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: ptrue p0.h |
| ; CHECK-SME-NEXT: uunpklo z3.s, z1.h |
| ; CHECK-SME-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SME-NEXT: sxtb z2.h, p0/m, z2.h |
| ; CHECK-SME-NEXT: ptrue p0.d |
| ; CHECK-SME-NEXT: uunpklo z5.d, z3.s |
| ; CHECK-SME-NEXT: uunpkhi z3.d, z3.s |
| ; CHECK-SME-NEXT: sunpklo z4.s, z2.h |
| ; CHECK-SME-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SME-NEXT: sunpklo z6.d, z4.s |
| ; CHECK-SME-NEXT: sunpkhi z4.d, z4.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: uunpklo z5.d, z1.s |
| ; CHECK-SME-NEXT: sunpklo z6.d, z2.s |
| ; CHECK-SME-NEXT: uunpkhi z1.d, z1.s |
| ; CHECK-SME-NEXT: sunpkhi z2.d, z2.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64> |
| %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64> |
| %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Negative test: %a (i16) is sign-extended and %b (i8) is zero-extended to |
| ; i64. The expected output contains no mixed-sign dot instruction for any RUN |
| ; line; %b is masked with #0xff and the product is accumulated via unpacks |
| ; and mla. |
| define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){ |
| ; CHECK-SVE2-LABEL: sudot_different_types: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SVE2-NEXT: sunpklo z3.s, z1.h |
| ; CHECK-SVE2-NEXT: ptrue p0.d |
| ; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SVE2-NEXT: uunpklo z4.s, z2.h |
| ; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpklo z5.d, z3.s |
| ; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s |
| ; CHECK-SVE2-NEXT: uunpklo z6.d, z4.s |
| ; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: sunpklo z5.d, z1.s |
| ; CHECK-SVE2-NEXT: uunpklo z6.d, z2.s |
| ; CHECK-SVE2-NEXT: sunpkhi z1.d, z1.s |
| ; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sudot_different_types: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: ptrue p0.d |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z3.d, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z4.d, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.d, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.d, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sudot_different_types: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SME-NEXT: sunpklo z3.s, z1.h |
| ; CHECK-SME-NEXT: ptrue p0.d |
| ; CHECK-SME-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SME-NEXT: uunpklo z4.s, z2.h |
| ; CHECK-SME-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SME-NEXT: sunpklo z5.d, z3.s |
| ; CHECK-SME-NEXT: sunpkhi z3.d, z3.s |
| ; CHECK-SME-NEXT: uunpklo z6.d, z4.s |
| ; CHECK-SME-NEXT: uunpkhi z4.d, z4.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: sunpklo z5.d, z1.s |
| ; CHECK-SME-NEXT: uunpklo z6.d, z2.s |
| ; CHECK-SME-NEXT: sunpkhi z1.d, z1.s |
| ; CHECK-SME-NEXT: uunpkhi z2.d, z2.s |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d |
| ; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64> |
| %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64> |
| %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult) |
| ret <vscale x 2 x i64> %partial.reduce |
| } |
| |
| ; Unsigned partial reduction of nxv8i8 inputs (zero-extended to i16) into an |
| ; nxv2i16 accumulator. Every RUN line expects both inputs to be masked with |
| ; #0xff and then combined by a single udot from .h elements into .d elements. |
| define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b){ |
| ; CHECK-SVE2-LABEL: udot_nxv8i8_promote: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SVE2-NEXT: and z1.h, z1.h, #0xff |
| ; CHECK-SVE2-NEXT: udot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: udot_nxv8i8_promote: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SVE2-I8MM-NEXT: and z1.h, z1.h, #0xff |
| ; CHECK-SVE2-I8MM-NEXT: udot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: udot_nxv8i8_promote: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SME-NEXT: and z1.h, z1.h, #0xff |
| ; CHECK-SME-NEXT: udot z0.d, z1.h, z2.h |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i16> |
| %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i16> |
| %mult = mul nuw nsw <vscale x 8 x i16> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i16> @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16(<vscale x 2 x i16> %acc, <vscale x 8 x i16> %mult) |
| ret <vscale x 2 x i16> %partial.reduce |
| } |
| |
| ; Signed partial reduction of nxv8i8 inputs (sign-extended to i16) into an |
| ; nxv2i16 accumulator. Every RUN line expects both inputs to be sign-extended |
| ; in place with sxtb and then combined by a single sdot from .h elements into |
| ; .d elements. |
| define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b){ |
| ; CHECK-SVE2-LABEL: sdot_nxv8i8_promote: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: ptrue p0.h |
| ; CHECK-SVE2-NEXT: sxtb z2.h, p0/m, z2.h |
| ; CHECK-SVE2-NEXT: sxtb z1.h, p0/m, z1.h |
| ; CHECK-SVE2-NEXT: sdot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sdot_nxv8i8_promote: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: ptrue p0.h |
| ; CHECK-SVE2-I8MM-NEXT: sxtb z2.h, p0/m, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: sxtb z1.h, p0/m, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: sdot z0.d, z1.h, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sdot_nxv8i8_promote: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: ptrue p0.h |
| ; CHECK-SME-NEXT: sxtb z2.h, p0/m, z2.h |
| ; CHECK-SME-NEXT: sxtb z1.h, p0/m, z1.h |
| ; CHECK-SME-NEXT: sdot z0.d, z1.h, z2.h |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16> |
| %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i16> |
| %mult = mul nuw nsw <vscale x 8 x i16> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 2 x i16> @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16(<vscale x 2 x i16> %acc, <vscale x 8 x i16> %mult) |
| ret <vscale x 2 x i16> %partial.reduce |
| } |
| |
| ; Partial reduction of nxv8i8 inputs (zero-extended to i64) into an nxv4i64 |
| ; accumulator. Every RUN line expects the inputs (here in z2/z3) to be masked |
| ; with #0xff and a single udot that writes z0.d only. |
| ; NOTE(review): the name suggests that only the accumulator type needs |
| ; splitting during legalisation - confirm against the lowering code. |
| define <vscale x 4 x i64> @partial_reduce_only_split_acc(<vscale x 4 x i64> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { |
| ; CHECK-SVE2-LABEL: partial_reduce_only_split_acc: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: and z3.h, z3.h, #0xff |
| ; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SVE2-NEXT: udot z0.d, z2.h, z3.h |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: partial_reduce_only_split_acc: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: and z3.h, z3.h, #0xff |
| ; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SVE2-I8MM-NEXT: udot z0.d, z2.h, z3.h |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: partial_reduce_only_split_acc: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: and z3.h, z3.h, #0xff |
| ; CHECK-SME-NEXT: and z2.h, z2.h, #0xff |
| ; CHECK-SME-NEXT: udot z0.d, z2.h, z3.h |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i64> |
| %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64> |
| %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( |
| <vscale x 4 x i64> %acc, <vscale x 8 x i64> %mult) |
| ret <vscale x 4 x i64> %partial.reduce |
| } |
| |
| ; Signed dot product against a constant: %a is sign-extended to i32 and |
| ; multiplied by splat(i32 -1). Every RUN line expects the constant to be |
| ; materialised as a byte splat (mov z2.b, #-1) and fed to sdot. |
| ; NOTE(review): `nuw` on a sign-extended value multiplied by -1 looks |
| ; unintended (it would wrap unsigned for most inputs) - confirm the flag is |
| ; irrelevant to what this test exercises. |
| define <vscale x 4 x i32> @sdot_imm(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) { |
| ; CHECK-SVE2-LABEL: sdot_imm: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: mov z2.b, #-1 // =0xffffffffffffffff |
| ; CHECK-SVE2-NEXT: sdot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sdot_imm: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: mov z2.b, #-1 // =0xffffffffffffffff |
| ; CHECK-SVE2-I8MM-NEXT: sdot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sdot_imm: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: mov z2.b, #-1 // =0xffffffffffffffff |
| ; CHECK-SME-NEXT: sdot z0.s, z1.b, z2.b |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> |
| %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 -1) |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |
| |
| ; Negative counterpart of sdot_imm: the multiplier is splat(i32 256), which is |
| ; not representable in a signed i8. Every RUN line expects no sdot; %a is |
| ; unpacked to .s elements, shifted left by 8 and summed with plain adds. |
| define <vscale x 4 x i32> @sdot_imm_does_not_fit(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) { |
| ; CHECK-SVE2-LABEL: sdot_imm_does_not_fit: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: sunpklo z2.h, z1.b |
| ; CHECK-SVE2-NEXT: sunpkhi z1.h, z1.b |
| ; CHECK-SVE2-NEXT: sunpklo z3.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: sunpklo z4.s, z1.h |
| ; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SVE2-NEXT: lsl z4.s, z4.s, #8 |
| ; CHECK-SVE2-NEXT: lsl z2.s, z2.s, #8 |
| ; CHECK-SVE2-NEXT: lsl z3.s, z3.s, #8 |
| ; CHECK-SVE2-NEXT: lsl z1.s, z1.s, #8 |
| ; CHECK-SVE2-NEXT: add z0.s, z0.s, z3.s |
| ; CHECK-SVE2-NEXT: add z2.s, z2.s, z4.s |
| ; CHECK-SVE2-NEXT: add z0.s, z0.s, z2.s |
| ; CHECK-SVE2-NEXT: add z0.s, z0.s, z1.s |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: sdot_imm_does_not_fit: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z2.h, z1.b |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.h, z1.b |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: lsl z4.s, z4.s, #8 |
| ; CHECK-SVE2-I8MM-NEXT: lsl z2.s, z2.s, #8 |
| ; CHECK-SVE2-I8MM-NEXT: lsl z3.s, z3.s, #8 |
| ; CHECK-SVE2-I8MM-NEXT: lsl z1.s, z1.s, #8 |
| ; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: add z2.s, z2.s, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: sdot_imm_does_not_fit: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: sunpklo z2.h, z1.b |
| ; CHECK-SME-NEXT: sunpkhi z1.h, z1.b |
| ; CHECK-SME-NEXT: sunpklo z3.s, z2.h |
| ; CHECK-SME-NEXT: sunpkhi z2.s, z2.h |
| ; CHECK-SME-NEXT: sunpklo z4.s, z1.h |
| ; CHECK-SME-NEXT: sunpkhi z1.s, z1.h |
| ; CHECK-SME-NEXT: lsl z4.s, z4.s, #8 |
| ; CHECK-SME-NEXT: lsl z2.s, z2.s, #8 |
| ; CHECK-SME-NEXT: lsl z3.s, z3.s, #8 |
| ; CHECK-SME-NEXT: lsl z1.s, z1.s, #8 |
| ; CHECK-SME-NEXT: add z0.s, z0.s, z3.s |
| ; CHECK-SME-NEXT: add z2.s, z2.s, z4.s |
| ; CHECK-SME-NEXT: add z0.s, z0.s, z2.s |
| ; CHECK-SME-NEXT: add z0.s, z0.s, z1.s |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> |
| %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 256) |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |
| |
| ; Unsigned dot product against a constant: %a is zero-extended to i32 and |
| ; multiplied by splat(i32 255). Every RUN line expects the constant to be |
| ; materialised as an all-ones byte splat (printed as mov z2.b, #-1) and fed |
| ; to udot. |
| define <vscale x 4 x i32> @udot_imm(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) { |
| ; CHECK-SVE2-LABEL: udot_imm: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: mov z2.b, #-1 // =0xffffffffffffffff |
| ; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: udot_imm: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: mov z2.b, #-1 // =0xffffffffffffffff |
| ; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: udot_imm: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: mov z2.b, #-1 // =0xffffffffffffffff |
| ; CHECK-SME-NEXT: udot z0.s, z1.b, z2.b |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32> |
| %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 255) |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |
| |
| ; Negative counterpart of udot_imm: the multiplier is splat(i32 256), one past |
| ; the largest unsigned i8 value. Every RUN line expects no udot; %a is |
| ; unpacked to .s elements, shifted left by 8 and summed with plain adds. |
| define <vscale x 4 x i32> @udot_imm_does_not_fit(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) { |
| ; CHECK-SVE2-LABEL: udot_imm_does_not_fit: |
| ; CHECK-SVE2: // %bb.0: // %entry |
| ; CHECK-SVE2-NEXT: uunpklo z2.h, z1.b |
| ; CHECK-SVE2-NEXT: uunpkhi z1.h, z1.b |
| ; CHECK-SVE2-NEXT: uunpklo z3.s, z2.h |
| ; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SVE2-NEXT: uunpklo z4.s, z1.h |
| ; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SVE2-NEXT: lsl z4.s, z4.s, #8 |
| ; CHECK-SVE2-NEXT: lsl z2.s, z2.s, #8 |
| ; CHECK-SVE2-NEXT: lsl z3.s, z3.s, #8 |
| ; CHECK-SVE2-NEXT: lsl z1.s, z1.s, #8 |
| ; CHECK-SVE2-NEXT: add z0.s, z0.s, z3.s |
| ; CHECK-SVE2-NEXT: add z2.s, z2.s, z4.s |
| ; CHECK-SVE2-NEXT: add z0.s, z0.s, z2.s |
| ; CHECK-SVE2-NEXT: add z0.s, z0.s, z1.s |
| ; CHECK-SVE2-NEXT: ret |
| ; |
| ; CHECK-SVE2-I8MM-LABEL: udot_imm_does_not_fit: |
| ; CHECK-SVE2-I8MM: // %bb.0: // %entry |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z2.h, z1.b |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.h, z1.b |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SVE2-I8MM-NEXT: lsl z4.s, z4.s, #8 |
| ; CHECK-SVE2-I8MM-NEXT: lsl z2.s, z2.s, #8 |
| ; CHECK-SVE2-I8MM-NEXT: lsl z3.s, z3.s, #8 |
| ; CHECK-SVE2-I8MM-NEXT: lsl z1.s, z1.s, #8 |
| ; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z3.s |
| ; CHECK-SVE2-I8MM-NEXT: add z2.s, z2.s, z4.s |
| ; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z2.s |
| ; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z1.s |
| ; CHECK-SVE2-I8MM-NEXT: ret |
| ; |
| ; CHECK-SME-LABEL: udot_imm_does_not_fit: |
| ; CHECK-SME: // %bb.0: // %entry |
| ; CHECK-SME-NEXT: uunpklo z2.h, z1.b |
| ; CHECK-SME-NEXT: uunpkhi z1.h, z1.b |
| ; CHECK-SME-NEXT: uunpklo z3.s, z2.h |
| ; CHECK-SME-NEXT: uunpkhi z2.s, z2.h |
| ; CHECK-SME-NEXT: uunpklo z4.s, z1.h |
| ; CHECK-SME-NEXT: uunpkhi z1.s, z1.h |
| ; CHECK-SME-NEXT: lsl z4.s, z4.s, #8 |
| ; CHECK-SME-NEXT: lsl z2.s, z2.s, #8 |
| ; CHECK-SME-NEXT: lsl z3.s, z3.s, #8 |
| ; CHECK-SME-NEXT: lsl z1.s, z1.s, #8 |
| ; CHECK-SME-NEXT: add z0.s, z0.s, z3.s |
| ; CHECK-SME-NEXT: add z2.s, z2.s, z4.s |
| ; CHECK-SME-NEXT: add z0.s, z0.s, z2.s |
| ; CHECK-SME-NEXT: add z0.s, z0.s, z1.s |
| ; CHECK-SME-NEXT: ret |
| entry: |
| %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32> |
| %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 256) |
| %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult) |
| ret <vscale x 4 x i32> %partial.reduce |
| } |