| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NODOT |
| ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NODOT |
| ; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvqdotq -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT32 |
| ; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvqdotq -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT64 |
| |
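; Basic signed dot-product reduction: both i8 operands are sign extended,
; multiplied, and reduce-added to a scalar. With Zvqdotq, vqdot.vv
; accumulates into a zeroed accumulator at VL=4/e32 (each i32 lane holds a
; 4-element i8 dot product); without it, the expansion is
; vsext.vf2 + vwmul.vv + vredsum.vs.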
| define i32 @vqdot_vv(<16 x i8> %a, <16 x i8> %b) { |
| ; NODOT-LABEL: vqdot_vv: |
| ; NODOT: # %bb.0: # %entry |
| ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma |
| ; NODOT-NEXT: vsext.vf2 v12, v8 |
| ; NODOT-NEXT: vsext.vf2 v14, v9 |
| ; NODOT-NEXT: vwmul.vv v8, v12, v14 |
| ; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma |
| ; NODOT-NEXT: vmv.s.x v12, zero |
| ; NODOT-NEXT: vredsum.vs v8, v8, v12 |
| ; NODOT-NEXT: vmv.x.s a0, v8 |
| ; NODOT-NEXT: ret |
| ; |
| ; DOT-LABEL: vqdot_vv: |
| ; DOT: # %bb.0: # %entry |
| ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma |
| ; DOT-NEXT: vmv.v.i v10, 0 |
| ; DOT-NEXT: vqdot.vv v10, v8, v9 |
| ; DOT-NEXT: vmv.s.x v8, zero |
| ; DOT-NEXT: vredsum.vs v8, v10, v8 |
| ; DOT-NEXT: vmv.x.s a0, v8 |
| ; DOT-NEXT: ret |
| entry: |
| %a.sext = sext <16 x i8> %a to <16 x i32> |
| %b.sext = sext <16 x i8> %b to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul) |
| ret i32 %res |
| } |
| |
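; Signed dot product against a positive splat constant. No vqdot.vx is
; formed; both configurations share the vwmul.vx expansion.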
| define i32 @vqdot_vx_constant(<16 x i8> %a) { |
| ; CHECK-LABEL: vqdot_vx_constant: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma |
| ; CHECK-NEXT: vsext.vf2 v12, v8 |
| ; CHECK-NEXT: li a0, 23 |
| ; CHECK-NEXT: vwmul.vx v8, v12, a0 |
| ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma |
| ; CHECK-NEXT: vmv.s.x v12, zero |
| ; CHECK-NEXT: vredsum.vs v8, v8, v12 |
| ; CHECK-NEXT: vmv.x.s a0, v8 |
| ; CHECK-NEXT: ret |
| entry: |
| %a.sext = sext <16 x i8> %a to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %a.sext, splat (i32 23) |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul) |
| ret i32 %res |
| } |
| |
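; Same as above with the splat as the left multiplicand; codegen is
; identical, showing the pattern match is commutative.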
| define i32 @vqdot_vx_constant_swapped(<16 x i8> %a) { |
| ; CHECK-LABEL: vqdot_vx_constant_swapped: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma |
| ; CHECK-NEXT: vsext.vf2 v12, v8 |
| ; CHECK-NEXT: li a0, 23 |
| ; CHECK-NEXT: vwmul.vx v8, v12, a0 |
| ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma |
| ; CHECK-NEXT: vmv.s.x v12, zero |
| ; CHECK-NEXT: vredsum.vs v8, v8, v12 |
| ; CHECK-NEXT: vmv.x.s a0, v8 |
| ; CHECK-NEXT: ret |
| entry: |
| %a.sext = sext <16 x i8> %a to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> splat (i32 23), %a.sext |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul) |
| ret i32 %res |
| } |
| |
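; Unsigned variant: both operands zero extended. With Zvqdotq this selects
; vqdotu.vv; without it, a widening unsigned multiply (vwmulu.vv) feeds a
; widening unsigned reduction (vwredsumu.vs).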
| define i32 @vqdotu_vv(<16 x i8> %a, <16 x i8> %b) { |
| ; NODOT-LABEL: vqdotu_vv: |
| ; NODOT: # %bb.0: # %entry |
| ; NODOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma |
| ; NODOT-NEXT: vwmulu.vv v10, v8, v9 |
| ; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma |
| ; NODOT-NEXT: vmv.s.x v8, zero |
| ; NODOT-NEXT: vsetvli zero, zero, e16, m2, ta, ma |
| ; NODOT-NEXT: vwredsumu.vs v8, v10, v8 |
| ; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma |
| ; NODOT-NEXT: vmv.x.s a0, v8 |
| ; NODOT-NEXT: ret |
| ; |
| ; DOT-LABEL: vqdotu_vv: |
| ; DOT: # %bb.0: # %entry |
| ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma |
| ; DOT-NEXT: vmv.v.i v10, 0 |
| ; DOT-NEXT: vqdotu.vv v10, v8, v9 |
| ; DOT-NEXT: vmv.s.x v8, zero |
| ; DOT-NEXT: vredsum.vs v8, v10, v8 |
| ; DOT-NEXT: vmv.x.s a0, v8 |
| ; DOT-NEXT: ret |
| entry: |
| %a.zext = zext <16 x i8> %a to <16 x i32> |
| %b.zext = zext <16 x i8> %b to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %a.zext, %b.zext |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul) |
| ret i32 %res |
| } |
| |
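; Unsigned dot product against a splat constant; expands to
; vzext.vf2 + vwmulu.vx in both configurations.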
| define i32 @vqdotu_vx_constant(<16 x i8> %a) { |
| ; CHECK-LABEL: vqdotu_vx_constant: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma |
| ; CHECK-NEXT: vzext.vf2 v12, v8 |
| ; CHECK-NEXT: li a0, 123 |
| ; CHECK-NEXT: vwmulu.vx v8, v12, a0 |
| ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma |
| ; CHECK-NEXT: vmv.s.x v12, zero |
| ; CHECK-NEXT: vredsum.vs v8, v8, v12 |
| ; CHECK-NEXT: vmv.x.s a0, v8 |
| ; CHECK-NEXT: ret |
| entry: |
| %a.zext = zext <16 x i8> %a to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %a.zext, splat (i32 123) |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul) |
| ret i32 %res |
| } |
| |
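; Mixed-signedness dot product (signed %a, unsigned %b). With Zvqdotq this
; selects vqdotsu.vv; without it, vwmulsu.vv performs the widening multiply.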
| define i32 @vqdotsu_vv(<16 x i8> %a, <16 x i8> %b) { |
| ; NODOT-LABEL: vqdotsu_vv: |
| ; NODOT: # %bb.0: # %entry |
| ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma |
| ; NODOT-NEXT: vsext.vf2 v12, v8 |
| ; NODOT-NEXT: vzext.vf2 v14, v9 |
| ; NODOT-NEXT: vwmulsu.vv v8, v12, v14 |
| ; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma |
| ; NODOT-NEXT: vmv.s.x v12, zero |
| ; NODOT-NEXT: vredsum.vs v8, v8, v12 |
| ; NODOT-NEXT: vmv.x.s a0, v8 |
| ; NODOT-NEXT: ret |
| ; |
| ; DOT-LABEL: vqdotsu_vv: |
| ; DOT: # %bb.0: # %entry |
| ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma |
| ; DOT-NEXT: vmv.v.i v10, 0 |
| ; DOT-NEXT: vqdotsu.vv v10, v8, v9 |
| ; DOT-NEXT: vmv.s.x v8, zero |
| ; DOT-NEXT: vredsum.vs v8, v10, v8 |
| ; DOT-NEXT: vmv.x.s a0, v8 |
| ; DOT-NEXT: ret |
| entry: |
| %a.sext = sext <16 x i8> %a to <16 x i32> |
| %b.zext = zext <16 x i8> %b to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %a.sext, %b.zext |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul) |
| ret i32 %res |
| } |
| |
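; As above with the multiply operands commuted in the IR; the signed value
; is still routed to the signed side of vqdotsu.vv / vwmulsu.vv.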
| define i32 @vqdotsu_vv_swapped(<16 x i8> %a, <16 x i8> %b) { |
| ; NODOT-LABEL: vqdotsu_vv_swapped: |
| ; NODOT: # %bb.0: # %entry |
| ; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma |
| ; NODOT-NEXT: vsext.vf2 v12, v8 |
| ; NODOT-NEXT: vzext.vf2 v14, v9 |
| ; NODOT-NEXT: vwmulsu.vv v8, v12, v14 |
| ; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma |
| ; NODOT-NEXT: vmv.s.x v12, zero |
| ; NODOT-NEXT: vredsum.vs v8, v8, v12 |
| ; NODOT-NEXT: vmv.x.s a0, v8 |
| ; NODOT-NEXT: ret |
| ; |
| ; DOT-LABEL: vqdotsu_vv_swapped: |
| ; DOT: # %bb.0: # %entry |
| ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma |
| ; DOT-NEXT: vmv.v.i v10, 0 |
| ; DOT-NEXT: vqdotsu.vv v10, v8, v9 |
| ; DOT-NEXT: vmv.s.x v8, zero |
| ; DOT-NEXT: vredsum.vs v8, v10, v8 |
| ; DOT-NEXT: vmv.x.s a0, v8 |
| ; DOT-NEXT: ret |
| entry: |
| %a.sext = sext <16 x i8> %a to <16 x i32> |
| %b.zext = zext <16 x i8> %b to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %b.zext, %a.sext |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul) |
| ret i32 %res |
| } |
| |
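; Signed vector times an unsigned splat. Since 123 is non-negative, the
; sext/zext mix degenerates to a plain signed widening multiply (vwmul.vx).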
define i32 @vqdotsu_vx_constant(<16 x i8> %a) {
; CHECK-LABEL: vqdotsu_vx_constant:
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma |
| ; CHECK-NEXT: vsext.vf2 v12, v8 |
| ; CHECK-NEXT: li a0, 123 |
| ; CHECK-NEXT: vwmul.vx v8, v12, a0 |
| ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma |
| ; CHECK-NEXT: vmv.s.x v12, zero |
| ; CHECK-NEXT: vredsum.vs v8, v8, v12 |
| ; CHECK-NEXT: vmv.x.s a0, v8 |
| ; CHECK-NEXT: ret |
| entry: |
| %a.sext = sext <16 x i8> %a to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %a.sext, splat (i32 123) |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul) |
| ret i32 %res |
| } |
| |
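; Unsigned vector times a negative splat. The scalar operand of vwmulsu.vx
; is the unsigned one, so the -23 splat must be materialized with vmv.v.x
; and used as the signed vs2 operand of vwmulsu.vv.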
define i32 @vqdotus_vx_constant(<16 x i8> %a) {
; CHECK-LABEL: vqdotus_vx_constant:
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma |
| ; CHECK-NEXT: vzext.vf2 v12, v8 |
| ; CHECK-NEXT: li a0, -23 |
| ; CHECK-NEXT: vmv.v.x v14, a0 |
| ; CHECK-NEXT: vwmulsu.vv v8, v14, v12 |
| ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma |
| ; CHECK-NEXT: vmv.s.x v12, zero |
| ; CHECK-NEXT: vredsum.vs v8, v8, v12 |
| ; CHECK-NEXT: vmv.x.s a0, v8 |
| ; CHECK-NEXT: ret |
| entry: |
| %a.zext = zext <16 x i8> %a to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %a.zext, splat (i32 -23) |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul) |
| ret i32 %res |
| } |
| |
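; A plain sign-extend + reduce is a dot product with an all-ones byte splat:
; lui 4112 + addi(w) 257 materializes 0x01010101, and vqdot.vx sums the four
; bytes of each e32 lane.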
| define i32 @reduce_of_sext(<16 x i8> %a) { |
| ; NODOT-LABEL: reduce_of_sext: |
| ; NODOT: # %bb.0: # %entry |
| ; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma |
| ; NODOT-NEXT: vsext.vf4 v12, v8 |
| ; NODOT-NEXT: vmv.s.x v8, zero |
| ; NODOT-NEXT: vredsum.vs v8, v12, v8 |
| ; NODOT-NEXT: vmv.x.s a0, v8 |
| ; NODOT-NEXT: ret |
| ; |
| ; DOT32-LABEL: reduce_of_sext: |
| ; DOT32: # %bb.0: # %entry |
| ; DOT32-NEXT: vsetivli zero, 4, e32, m1, ta, ma |
| ; DOT32-NEXT: vmv.v.i v9, 0 |
| ; DOT32-NEXT: lui a0, 4112 |
| ; DOT32-NEXT: addi a0, a0, 257 |
| ; DOT32-NEXT: vqdot.vx v9, v8, a0 |
| ; DOT32-NEXT: vmv.s.x v8, zero |
| ; DOT32-NEXT: vredsum.vs v8, v9, v8 |
| ; DOT32-NEXT: vmv.x.s a0, v8 |
| ; DOT32-NEXT: ret |
| ; |
| ; DOT64-LABEL: reduce_of_sext: |
| ; DOT64: # %bb.0: # %entry |
| ; DOT64-NEXT: vsetivli zero, 4, e32, m1, ta, ma |
| ; DOT64-NEXT: vmv.v.i v9, 0 |
| ; DOT64-NEXT: lui a0, 4112 |
| ; DOT64-NEXT: addiw a0, a0, 257 |
| ; DOT64-NEXT: vqdot.vx v9, v8, a0 |
| ; DOT64-NEXT: vmv.s.x v8, zero |
| ; DOT64-NEXT: vredsum.vs v8, v9, v8 |
| ; DOT64-NEXT: vmv.x.s a0, v8 |
| ; DOT64-NEXT: ret |
| entry: |
| %a.ext = sext <16 x i8> %a to <16 x i32> |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext) |
| ret i32 %res |
| } |
| |
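; Zero-extend + reduce uses the same 0x01010101 trick with vqdotu.vx. The
; RV32 and RV64 runs differ only in addi vs addiw when materializing the
; constant.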
| define i32 @reduce_of_zext(<16 x i8> %a) { |
| ; NODOT-LABEL: reduce_of_zext: |
| ; NODOT: # %bb.0: # %entry |
| ; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma |
| ; NODOT-NEXT: vzext.vf4 v12, v8 |
| ; NODOT-NEXT: vmv.s.x v8, zero |
| ; NODOT-NEXT: vredsum.vs v8, v12, v8 |
| ; NODOT-NEXT: vmv.x.s a0, v8 |
| ; NODOT-NEXT: ret |
| ; |
| ; DOT32-LABEL: reduce_of_zext: |
| ; DOT32: # %bb.0: # %entry |
| ; DOT32-NEXT: vsetivli zero, 4, e32, m1, ta, ma |
| ; DOT32-NEXT: vmv.v.i v9, 0 |
| ; DOT32-NEXT: lui a0, 4112 |
| ; DOT32-NEXT: addi a0, a0, 257 |
| ; DOT32-NEXT: vqdotu.vx v9, v8, a0 |
| ; DOT32-NEXT: vmv.s.x v8, zero |
| ; DOT32-NEXT: vredsum.vs v8, v9, v8 |
| ; DOT32-NEXT: vmv.x.s a0, v8 |
| ; DOT32-NEXT: ret |
| ; |
| ; DOT64-LABEL: reduce_of_zext: |
| ; DOT64: # %bb.0: # %entry |
| ; DOT64-NEXT: vsetivli zero, 4, e32, m1, ta, ma |
| ; DOT64-NEXT: vmv.v.i v9, 0 |
| ; DOT64-NEXT: lui a0, 4112 |
| ; DOT64-NEXT: addiw a0, a0, 257 |
| ; DOT64-NEXT: vqdotu.vx v9, v8, a0 |
| ; DOT64-NEXT: vmv.s.x v8, zero |
| ; DOT64-NEXT: vredsum.vs v8, v9, v8 |
| ; DOT64-NEXT: vmv.x.s a0, v8 |
| ; DOT64-NEXT: ret |
| entry: |
| %a.ext = zext <16 x i8> %a to <16 x i32> |
| %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext) |
| ret i32 %res |
| } |