; blob: e48bc9cdfea4eb329f5baa1e1142294c880d73a1 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NODOT
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NODOT
; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvqdotq -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT32
; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvqdotq -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT64
; Signed x signed dot product of two <16 x i8> vectors: both operands are
; sign-extended to i32, multiplied lane-wise, and the 16 products are summed
; with llvm.vector.reduce.add.
; Expected code without zvqdotq: sign-extend to e16, widening multiply
; (vwmul.vv) to e32, then vredsum.vs.
; Expected code with zvqdotq: vqdot.vv accumulating into a zeroed 4 x e32
; register, then vredsum.vs over those four partial sums.
define i32 @vqdot_vv(<16 x i8> %a, <16 x i8> %b) {
; NODOT-LABEL: vqdot_vv:
; NODOT: # %bb.0: # %entry
; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; NODOT-NEXT: vsext.vf2 v12, v8
; NODOT-NEXT: vsext.vf2 v14, v9
; NODOT-NEXT: vwmul.vv v8, v12, v14
; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; NODOT-NEXT: vmv.s.x v12, zero
; NODOT-NEXT: vredsum.vs v8, v8, v12
; NODOT-NEXT: vmv.x.s a0, v8
; NODOT-NEXT: ret
;
; DOT-LABEL: vqdot_vv:
; DOT: # %bb.0: # %entry
; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; DOT-NEXT: vmv.v.i v10, 0
; DOT-NEXT: vqdot.vv v10, v8, v9
; DOT-NEXT: vmv.s.x v8, zero
; DOT-NEXT: vredsum.vs v8, v10, v8
; DOT-NEXT: vmv.x.s a0, v8
; DOT-NEXT: ret
entry:
%a.sext = sext <16 x i8> %a to <16 x i32>
%b.sext = sext <16 x i8> %b to <16 x i32>
%mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
ret i32 %res
}
; Sign-extended <16 x i8> vector multiplied by the constant splat 23, then
; add-reduced. All four run configurations share one expected sequence
; (vsext.vf2 + vwmul.vx + vredsum.vs), i.e. no vqdot instruction is expected
; for this pattern even when zvqdotq is enabled.
define i32 @vqdot_vx_constant(<16 x i8> %a) {
; CHECK-LABEL: vqdot_vx_constant:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vsext.vf2 v12, v8
; CHECK-NEXT: li a0, 23
; CHECK-NEXT: vwmul.vx v8, v12, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
%a.sext = sext <16 x i8> %a to <16 x i32>
%mul = mul nuw nsw <16 x i32> %a.sext, splat (i32 23)
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
ret i32 %res
}
; Same computation as a sign-extended vector times splat 23, but with the
; constant written as the first operand of the multiply. The expected code is
; the same vsext.vf2 + vwmul.vx + vredsum.vs sequence for every run
; configuration, so operand order is not expected to affect the output.
define i32 @vqdot_vx_constant_swapped(<16 x i8> %a) {
; CHECK-LABEL: vqdot_vx_constant_swapped:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vsext.vf2 v12, v8
; CHECK-NEXT: li a0, 23
; CHECK-NEXT: vwmul.vx v8, v12, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
%a.sext = sext <16 x i8> %a to <16 x i32>
%mul = mul nuw nsw <16 x i32> splat (i32 23), %a.sext
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
ret i32 %res
}
; Unsigned x unsigned dot product of two <16 x i8> vectors: both operands are
; zero-extended to i32, multiplied lane-wise, and add-reduced.
; Expected code without zvqdotq: widening unsigned multiply on the e8 inputs
; (vwmulu.vv) followed by a widening unsigned reduction (vwredsumu.vs) at e16.
; Expected code with zvqdotq: vqdotu.vv accumulating into a zeroed 4 x e32
; register, then vredsum.vs.
define i32 @vqdotu_vv(<16 x i8> %a, <16 x i8> %b) {
; NODOT-LABEL: vqdotu_vv:
; NODOT: # %bb.0: # %entry
; NODOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; NODOT-NEXT: vwmulu.vv v10, v8, v9
; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; NODOT-NEXT: vmv.s.x v8, zero
; NODOT-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; NODOT-NEXT: vwredsumu.vs v8, v10, v8
; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; NODOT-NEXT: vmv.x.s a0, v8
; NODOT-NEXT: ret
;
; DOT-LABEL: vqdotu_vv:
; DOT: # %bb.0: # %entry
; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; DOT-NEXT: vmv.v.i v10, 0
; DOT-NEXT: vqdotu.vv v10, v8, v9
; DOT-NEXT: vmv.s.x v8, zero
; DOT-NEXT: vredsum.vs v8, v10, v8
; DOT-NEXT: vmv.x.s a0, v8
; DOT-NEXT: ret
entry:
%a.zext = zext <16 x i8> %a to <16 x i32>
%b.zext = zext <16 x i8> %b to <16 x i32>
%mul = mul nuw nsw <16 x i32> %a.zext, %b.zext
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
ret i32 %res
}
; Zero-extended <16 x i8> vector multiplied by the constant splat 123, then
; add-reduced. All four run configurations share one expected sequence
; (vzext.vf2 + vwmulu.vx + vredsum.vs), i.e. no vqdotu instruction is expected
; for this pattern even when zvqdotq is enabled.
define i32 @vqdotu_vx_constant(<16 x i8> %a) {
; CHECK-LABEL: vqdotu_vx_constant:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vzext.vf2 v12, v8
; CHECK-NEXT: li a0, 123
; CHECK-NEXT: vwmulu.vx v8, v12, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
%a.zext = zext <16 x i8> %a to <16 x i32>
%mul = mul nuw nsw <16 x i32> %a.zext, splat (i32 123)
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
ret i32 %res
}
; Mixed-sign dot product: %a is sign-extended and %b is zero-extended to i32
; before the lane-wise multiply and add reduction.
; Expected code without zvqdotq: vsext.vf2 / vzext.vf2 to e16, a
; signed-by-unsigned widening multiply (vwmulsu.vv), then vredsum.vs.
; Expected code with zvqdotq: vqdotsu.vv with %a (v8) as the first source and
; %b (v9) as the second, accumulating into a zeroed 4 x e32 register, then
; vredsum.vs.
define i32 @vqdotsu_vv(<16 x i8> %a, <16 x i8> %b) {
; NODOT-LABEL: vqdotsu_vv:
; NODOT: # %bb.0: # %entry
; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; NODOT-NEXT: vsext.vf2 v12, v8
; NODOT-NEXT: vzext.vf2 v14, v9
; NODOT-NEXT: vwmulsu.vv v8, v12, v14
; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; NODOT-NEXT: vmv.s.x v12, zero
; NODOT-NEXT: vredsum.vs v8, v8, v12
; NODOT-NEXT: vmv.x.s a0, v8
; NODOT-NEXT: ret
;
; DOT-LABEL: vqdotsu_vv:
; DOT: # %bb.0: # %entry
; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; DOT-NEXT: vmv.v.i v10, 0
; DOT-NEXT: vqdotsu.vv v10, v8, v9
; DOT-NEXT: vmv.s.x v8, zero
; DOT-NEXT: vredsum.vs v8, v10, v8
; DOT-NEXT: vmv.x.s a0, v8
; DOT-NEXT: ret
entry:
%a.sext = sext <16 x i8> %a to <16 x i32>
%b.zext = zext <16 x i8> %b to <16 x i32>
%mul = mul nuw nsw <16 x i32> %a.sext, %b.zext
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
ret i32 %res
}
; Mixed-sign dot product with the multiply operands written in the opposite
; order: the zero-extended %b is the first operand and the sign-extended %a is
; the second. The expected instruction sequences are the same as for the
; unswapped form (vwmulsu.vv without zvqdotq, vqdotsu.vv v10, v8, v9 with it),
; so the signed source is still expected to be placed first.
define i32 @vqdotsu_vv_swapped(<16 x i8> %a, <16 x i8> %b) {
; NODOT-LABEL: vqdotsu_vv_swapped:
; NODOT: # %bb.0: # %entry
; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; NODOT-NEXT: vsext.vf2 v12, v8
; NODOT-NEXT: vzext.vf2 v14, v9
; NODOT-NEXT: vwmulsu.vv v8, v12, v14
; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; NODOT-NEXT: vmv.s.x v12, zero
; NODOT-NEXT: vredsum.vs v8, v8, v12
; NODOT-NEXT: vmv.x.s a0, v8
; NODOT-NEXT: ret
;
; DOT-LABEL: vqdotsu_vv_swapped:
; DOT: # %bb.0: # %entry
; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; DOT-NEXT: vmv.v.i v10, 0
; DOT-NEXT: vqdotsu.vv v10, v8, v9
; DOT-NEXT: vmv.s.x v8, zero
; DOT-NEXT: vredsum.vs v8, v10, v8
; DOT-NEXT: vmv.x.s a0, v8
; DOT-NEXT: ret
entry:
%a.sext = sext <16 x i8> %a to <16 x i32>
%b.zext = zext <16 x i8> %b to <16 x i32>
%mul = mul nuw nsw <16 x i32> %b.zext, %a.sext
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
ret i32 %res
}
; Sign-extended <16 x i8> vector multiplied by the constant splat 123, then
; add-reduced. All four run configurations share one expected sequence
; (vsext.vf2 + vwmul.vx + vredsum.vs), i.e. no dot-product instruction is
; expected for this pattern even when zvqdotq is enabled.
; Review note (to confirm): the function name spells "vdotqsu" where the other
; tests in this file use the "vqdot" spelling; it is left unchanged here
; because the label directives below depend on it.
define i32 @vdotqsu_vx_constant(<16 x i8> %a) {
; CHECK-LABEL: vdotqsu_vx_constant:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vsext.vf2 v12, v8
; CHECK-NEXT: li a0, 123
; CHECK-NEXT: vwmul.vx v8, v12, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
%a.sext = sext <16 x i8> %a to <16 x i32>
%mul = mul nuw nsw <16 x i32> %a.sext, splat (i32 123)
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
ret i32 %res
}
; Zero-extended <16 x i8> vector multiplied by the negative constant splat -23,
; then add-reduced. All four run configurations share one expected sequence:
; the constant is splatted into a vector register (vmv.v.x) and used as the
; signed operand of vwmulsu.vv, with the zero-extended input as the unsigned
; operand, followed by vredsum.vs. No dot-product instruction is expected even
; when zvqdotq is enabled.
; Review note (to confirm): the multiply carries `nuw`, but a zero-extended
; lane of 2 or more multiplied by -23 wraps as an unsigned i32 multiply, which
; would make the result poison for such inputs; verify the flag is intended.
; Review note (to confirm): the function name spells "vdotqus" where the other
; tests in this file use the "vqdot" spelling.
define i32 @vdotqus_vx_constant(<16 x i8> %a) {
; CHECK-LABEL: vdotqus_vx_constant:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vzext.vf2 v12, v8
; CHECK-NEXT: li a0, -23
; CHECK-NEXT: vmv.v.x v14, a0
; CHECK-NEXT: vwmulsu.vv v8, v14, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmv.s.x v12, zero
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
%a.zext = zext <16 x i8> %a to <16 x i32>
%mul = mul nuw nsw <16 x i32> %a.zext, splat (i32 -23)
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
ret i32 %res
}
; Add reduction of a sign-extended <16 x i8> vector with no multiply in the IR.
; Expected code without zvqdotq: vsext.vf4 to e32 followed by vredsum.vs.
; Expected code with zvqdotq: vqdot.vx against a scalar built by lui 4112 plus
; 257, which is 0x01010101 (every byte equal to 1), so the plain sum is
; expressed as a dot product with all-ones bytes; the four e32 partial sums
; are then combined with vredsum.vs.
; The riscv32 and riscv64 zvqdotq runs are checked separately because the
; constant is materialized with addi on riscv32 and addiw on riscv64.
define i32 @reduce_of_sext(<16 x i8> %a) {
; NODOT-LABEL: reduce_of_sext:
; NODOT: # %bb.0: # %entry
; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; NODOT-NEXT: vsext.vf4 v12, v8
; NODOT-NEXT: vmv.s.x v8, zero
; NODOT-NEXT: vredsum.vs v8, v12, v8
; NODOT-NEXT: vmv.x.s a0, v8
; NODOT-NEXT: ret
;
; DOT32-LABEL: reduce_of_sext:
; DOT32: # %bb.0: # %entry
; DOT32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; DOT32-NEXT: vmv.v.i v9, 0
; DOT32-NEXT: lui a0, 4112
; DOT32-NEXT: addi a0, a0, 257
; DOT32-NEXT: vqdot.vx v9, v8, a0
; DOT32-NEXT: vmv.s.x v8, zero
; DOT32-NEXT: vredsum.vs v8, v9, v8
; DOT32-NEXT: vmv.x.s a0, v8
; DOT32-NEXT: ret
;
; DOT64-LABEL: reduce_of_sext:
; DOT64: # %bb.0: # %entry
; DOT64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; DOT64-NEXT: vmv.v.i v9, 0
; DOT64-NEXT: lui a0, 4112
; DOT64-NEXT: addiw a0, a0, 257
; DOT64-NEXT: vqdot.vx v9, v8, a0
; DOT64-NEXT: vmv.s.x v8, zero
; DOT64-NEXT: vredsum.vs v8, v9, v8
; DOT64-NEXT: vmv.x.s a0, v8
; DOT64-NEXT: ret
entry:
%a.ext = sext <16 x i8> %a to <16 x i32>
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext)
ret i32 %res
}
; Add reduction of a zero-extended <16 x i8> vector with no multiply in the IR.
; Expected code without zvqdotq: vzext.vf4 to e32 followed by vredsum.vs.
; Expected code with zvqdotq: vqdotu.vx against a scalar built by lui 4112 plus
; 257, which is 0x01010101 (every byte equal to 1), so the plain sum is
; expressed as an unsigned dot product with all-ones bytes; the four e32
; partial sums are then combined with vredsum.vs.
; The riscv32 and riscv64 zvqdotq runs are checked separately because the
; constant is materialized with addi on riscv32 and addiw on riscv64.
define i32 @reduce_of_zext(<16 x i8> %a) {
; NODOT-LABEL: reduce_of_zext:
; NODOT: # %bb.0: # %entry
; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; NODOT-NEXT: vzext.vf4 v12, v8
; NODOT-NEXT: vmv.s.x v8, zero
; NODOT-NEXT: vredsum.vs v8, v12, v8
; NODOT-NEXT: vmv.x.s a0, v8
; NODOT-NEXT: ret
;
; DOT32-LABEL: reduce_of_zext:
; DOT32: # %bb.0: # %entry
; DOT32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; DOT32-NEXT: vmv.v.i v9, 0
; DOT32-NEXT: lui a0, 4112
; DOT32-NEXT: addi a0, a0, 257
; DOT32-NEXT: vqdotu.vx v9, v8, a0
; DOT32-NEXT: vmv.s.x v8, zero
; DOT32-NEXT: vredsum.vs v8, v9, v8
; DOT32-NEXT: vmv.x.s a0, v8
; DOT32-NEXT: ret
;
; DOT64-LABEL: reduce_of_zext:
; DOT64: # %bb.0: # %entry
; DOT64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; DOT64-NEXT: vmv.v.i v9, 0
; DOT64-NEXT: lui a0, 4112
; DOT64-NEXT: addiw a0, a0, 257
; DOT64-NEXT: vqdotu.vx v9, v8, a0
; DOT64-NEXT: vmv.s.x v8, zero
; DOT64-NEXT: vredsum.vs v8, v9, v8
; DOT64-NEXT: vmv.x.s a0, v8
; DOT64-NEXT: ret
entry:
%a.ext = zext <16 x i8> %a to <16 x i32>
%res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext)
ret i32 %res
}