[RISCV] Expand zvqdotq partial.reduce test variants

Make sure to cover all of the legal scalable types, plus cases which
require splitting, and all of the instructions.  Not duplicating vx
testing at this time.
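
For reference, every added test follows the same shape as the existing
partial.reduce coverage: extend the i8 operands to i32, multiply, and
feed the product into the partial reduction intrinsic.  The nf2 variant
below is representative; the other variants differ only in element
counts and in the choice of sext/zext:

  %a.sext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
  %b.sext = sext <vscale x 4 x i8> %b to <vscale x 4 x i32>
  %mul = mul nuw nsw <vscale x 4 x i32> %a.sext, %b.sext
  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)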
diff --git a/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll
index 3408445..5eb649d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll
@@ -523,8 +523,53 @@
 }
 
 
-define <vscale x 4 x i32> @vqdot_vv_partial_reduce(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: vqdot_vv_partial_reduce:
+define <vscale x 1 x i32> @partial_reduce_nf2(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
+; CHECK-LABEL: partial_reduce_nf2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vsext.vf2 v11, v9
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vwmul.vv v8, v10, v11
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v10, v9, a0
+; CHECK-NEXT:    vslidedown.vx v11, v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vadd.vv v9, v11, v9
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
+  %b.sext = sext <vscale x 4 x i8> %b to <vscale x 4 x i32>
+  %mul = mul nuw nsw <vscale x 4 x i32> %a.sext, %b.sext
+  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
+  ret <vscale x 1 x i32> %res
+}
+
+define <vscale x 2 x i32> @partial_reduce_m1(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v12, v8
+; CHECK-NEXT:    vsext.vf2 v14, v9
+; CHECK-NEXT:    vwmul.vv v8, v12, v14
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v11, v8
+; CHECK-NEXT:    vadd.vv v9, v9, v10
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.sext = sext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %mul = mul nuw nsw <vscale x 8 x i32> %a.sext, %b.sext
+  %res = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 2 x i32> zeroinitializer, <vscale x 8 x i32> %mul)
+  ret <vscale x 2 x i32> %res
+}
+
+define <vscale x 4 x i32> @partial_reduce_m2(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v16, v8
@@ -543,8 +588,178 @@
   ret <vscale x 4 x i32> %res
 }
 
-define <vscale x 4 x i32> @vqdot_vv_partial_reduce2(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i32> %accum) {
-; CHECK-LABEL: vqdot_vv_partial_reduce2:
+define <vscale x 8 x i32> @partial_reduce_m4(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsext.vf2 v24, v8
+; CHECK-NEXT:    vsext.vf2 v16, v10
+; CHECK-NEXT:    vsext.vf2 v28, v12
+; CHECK-NEXT:    vsext.vf2 v20, v14
+; CHECK-NEXT:    vwmul.vv v8, v16, v20
+; CHECK-NEXT:    vwmul.vv v16, v24, v28
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vadd.vv v16, v20, v16
+; CHECK-NEXT:    vadd.vv v8, v12, v8
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <vscale x 32 x i8> %a to <vscale x 32 x i32>
+  %b.sext = sext <vscale x 32 x i8> %b to <vscale x 32 x i32>
+  %mul = mul nuw nsw <vscale x 32 x i32> %a.sext, %b.sext
+  %res = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 8 x i32> zeroinitializer, <vscale x 32 x i32> %mul)
+  ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 16 x i32> @partial_reduce_m8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsext.vf2 v24, v10
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v24, (a0) # vscale x 32-byte Folded Spill
+; CHECK-NEXT:    vsext.vf2 v0, v8
+; CHECK-NEXT:    vsext.vf2 v8, v18
+; CHECK-NEXT:    vsext.vf2 v4, v16
+; CHECK-NEXT:    vwmul.vv v24, v0, v4
+; CHECK-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; CHECK-NEXT:    vwmacc.vv v24, v16, v8
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    vsext.vf2 v16, v20
+; CHECK-NEXT:    vwmacc.vv v24, v8, v16
+; CHECK-NEXT:    vsext.vf2 v8, v14
+; CHECK-NEXT:    vsext.vf2 v12, v22
+; CHECK-NEXT:    vwmacc.vv v24, v8, v12
+; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <vscale x 64 x i8> %a to <vscale x 64 x i32>
+  %b.sext = sext <vscale x 64 x i8> %b to <vscale x 64 x i32>
+  %mul = mul nuw nsw <vscale x 64 x i32> %a.sext, %b.sext
+  %res = call <vscale x 16 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 16 x i32> zeroinitializer, <vscale x 64 x i32> %mul)
+  ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 32 x i32> @partial_reduce_m16(<vscale x 128 x i8> %a, <vscale x 128 x i8> %b) {
+; CHECK-LABEL: partial_reduce_m16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vl8r.v v16, (a0)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsext.vf2 v4, v8
+; CHECK-NEXT:    vsext.vf2 v0, v16
+; CHECK-NEXT:    vwmul.vv v24, v4, v0
+; CHECK-NEXT:    vsext.vf2 v4, v10
+; CHECK-NEXT:    vsext.vf2 v8, v18
+; CHECK-NEXT:    vwmacc.vv v24, v4, v8
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vsext.vf2 v0, v12
+; CHECK-NEXT:    vl8r.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vsext.vf2 v4, v20
+; CHECK-NEXT:    vwmacc.vv v24, v0, v4
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vsext.vf2 v20, v0
+; CHECK-NEXT:    vsext.vf2 v16, v8
+; CHECK-NEXT:    vwmul.vv v0, v20, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vsext.vf2 v20, v18
+; CHECK-NEXT:    vsext.vf2 v16, v10
+; CHECK-NEXT:    vwmacc.vv v0, v20, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vsext.vf2 v8, v20
+; CHECK-NEXT:    vsext.vf2 v16, v12
+; CHECK-NEXT:    vwmacc.vv v0, v8, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vsext.vf2 v8, v22
+; CHECK-NEXT:    vsext.vf2 v16, v14
+; CHECK-NEXT:    vwmacc.vv v0, v8, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vsext.vf2 v8, v14
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vsext.vf2 v12, v22
+; CHECK-NEXT:    vwmacc.vv v24, v8, v12
+; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmv8r.v v16, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <vscale x 128 x i8> %a to <vscale x 128 x i32>
+  %b.sext = sext <vscale x 128 x i8> %b to <vscale x 128 x i32>
+  %mul = mul nuw nsw <vscale x 128 x i32> %a.sext, %b.sext
+  %res = call <vscale x 32 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 32 x i32> zeroinitializer, <vscale x 128 x i32> %mul)
+  ret <vscale x 32 x i32> %res
+}
+
+define <vscale x 4 x i32> @partial_reduce_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i32> %accum) {
+; CHECK-LABEL: partial_reduce_accum:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v24, v8
@@ -564,8 +779,8 @@
   ret <vscale x 4 x i32> %res
 }
 
-define <vscale x 16 x i32> @vqdot_vv_partial_reduce3(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: vqdot_vv_partial_reduce3:
+define <vscale x 16 x i32> @partial_reduce_via_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: partial_reduce_via_accum:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v16, v8
@@ -579,3 +794,53 @@
   %res = call <vscale x 16 x i32> @llvm.experimental.vector.partial.reduce.add.nvx16i32.nvx16i32(<vscale x 16 x i32> %mul, <vscale x 16 x i32> zeroinitializer)
   ret <vscale x 16 x i32> %res
 }
+
+define <vscale x 1 x i32> @partial_reduce_vqdotu(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
+; CHECK-LABEL: partial_reduce_vqdotu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vwmulu.vv v10, v8, v9
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v8, v10
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v10, v9, a0
+; CHECK-NEXT:    vslidedown.vx v11, v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vadd.vv v9, v11, v9
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.zext = zext <vscale x 4 x i8> %a to <vscale x 4 x i32>
+  %b.zext = zext <vscale x 4 x i8> %b to <vscale x 4 x i32>
+  %mul = mul nuw nsw <vscale x 4 x i32> %a.zext, %b.zext
+  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
+  ret <vscale x 1 x i32> %res
+}
+
+define <vscale x 1 x i32> @partial_reduce_vqdotsu(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
+; CHECK-LABEL: partial_reduce_vqdotsu:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vzext.vf2 v11, v9
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vwmulsu.vv v8, v10, v11
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v10, v9, a0
+; CHECK-NEXT:    vslidedown.vx v11, v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vadd.vv v9, v11, v9
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
+  %b.zext = zext <vscale x 4 x i8> %b to <vscale x 4 x i32>
+  %mul = mul nuw nsw <vscale x 4 x i32> %a.sext, %b.zext
+  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
+  ret <vscale x 1 x i32> %res
+}