; blob: 89876c62c4cd87cede23060056325ecbc8060b53 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefixes=GFX9-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10-DL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9-NODL %s
; Test instruction selection of dot2 patterns, both with a saturating accumulate (clamp) and without it.
;------------------------------------------------------------------------------
; DOT2 SATURATING TESTS
;------------------------------------------------------------------------------
; Unsigned dot2 with saturation: uaddsat(a[0]*b[0] + a[1]*b[1], c)
; Both <2 x i16> operands are zero-extended to <2 x i32> and multiplied lane-wise;
; the two products are summed and %c is accumulated with llvm.uadd.sat.i32.
; Recorded expectations: the gfx906, gfx1011 and gfx950 runs emit a single
; v_dot2_u32_u16 carrying the clamp modifier, while the gfx900 run emits two
; SDWA 24-bit multiplies followed by a plain add and a clamped add.
define i32 @udot2_sat(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX9-DL-LABEL: udot2_sat:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot2_u32_u16 v0, v1, v0, v2 clamp
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: udot2_sat:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_dot2_u32_u16 v0, v1, v0, v2 clamp
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: udot2_sat:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_dot2_u32_u16 v0, v1, v0, v2 clamp
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: udot2_sat:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: v_add_u32_e32 v0, v3, v0
; GFX9-NODL-NEXT: v_add_u32_e64 v0, v0, v2 clamp
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; Widen both operands as unsigned values.
%conv.i = zext <2 x i16> %a to <2 x i32>
%conv6.i = zext <2 x i16> %b to <2 x i32>
; Lane-wise products; note that the extension of %b is the first multiplicand.
%mul.i = mul <2 x i32> %conv6.i, %conv.i
%e0 = extractelement <2 x i32> %mul.i, i64 0
%e1 = extractelement <2 x i32> %mul.i, i64 1
; Sum the two products with a wrapping add, then saturate only the accumulate.
%add.i = add i32 %e0, %e1
%cond.i.i = tail call i32 @llvm.uadd.sat.i32(i32 %add.i, i32 %c)
ret i32 %cond.i.i
}
; Signed dot2 with saturation: saddsat(a[0]*b[0] + a[1]*b[1], c)
; Both <2 x i16> operands are sign-extended to <2 x i32> and multiplied lane-wise;
; the two products are summed and %c is accumulated with llvm.sadd.sat.i32.
; Recorded expectations: the gfx906, gfx1011 and gfx950 runs emit a single
; v_dot2_i32_i16 carrying the clamp modifier, while the gfx900 run emits two
; sign-extending SDWA 24-bit multiplies, a plain add and a clamped v_add_i32.
define i32 @sdot2_sat(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX9-DL-LABEL: sdot2_sat:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot2_i32_i16 v0, v1, v0, v2 clamp
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: sdot2_sat:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_dot2_i32_i16 v0, v1, v0, v2 clamp
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: sdot2_sat:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_dot2_i32_i16 v0, v1, v0, v2 clamp
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: sdot2_sat:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v0, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: v_add_u32_e32 v0, v3, v0
; GFX9-NODL-NEXT: v_add_i32 v0, v0, v2 clamp
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; Widen both operands as signed values.
%conv.i = sext <2 x i16> %a to <2 x i32>
%conv6.i = sext <2 x i16> %b to <2 x i32>
; Lane-wise products; note that the extension of %b is the first multiplicand.
%mul.i = mul <2 x i32> %conv6.i, %conv.i
%e0 = extractelement <2 x i32> %mul.i, i64 0
%e1 = extractelement <2 x i32> %mul.i, i64 1
; Sum the two products with a wrapping add, then saturate only the accumulate.
%add.i = add i32 %e0, %e1
%cond1.i.i = tail call i32 @llvm.sadd.sat.i32(i32 %add.i, i32 %c)
ret i32 %cond1.i.i
}
;------------------------------------------------------------------------------
; DOT2 NON-SATURATING TESTS
;------------------------------------------------------------------------------
; Unsigned dot2 without saturation
; Zero-extends both <2 x i16> operands to <2 x i32>, multiplies lane-wise and
; accumulates with ordinary wrapping adds in the order (e1 + c) + e0.
; Recorded expectations: the gfx906, gfx1011 and gfx950 runs emit a single
; v_dot2_u32_u16 without clamp, while the gfx900 run emits two SDWA 24-bit
; multiplies and one v_add3_u32.
define i32 @udot2_unsat(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX9-DL-LABEL: udot2_unsat:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot2_u32_u16 v0, v1, v0, v2
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: udot2_unsat:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_dot2_u32_u16 v0, v1, v0, v2
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: udot2_unsat:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_dot2_u32_u16 v0, v1, v0, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: udot2_unsat:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v2, v3
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; Widen both operands as unsigned values.
%conv.i = zext <2 x i16> %a to <2 x i32>
%conv6.i = zext <2 x i16> %b to <2 x i32>
; Lane-wise products; note that the extension of %b is the first multiplicand.
%mul.i = mul <2 x i32> %conv6.i, %conv.i
%e0 = extractelement <2 x i32> %mul.i, i64 0
%e1 = extractelement <2 x i32> %mul.i, i64 1
; The accumulator is added to the lane-1 product first, then lane 0 is added.
%add.i = add i32 %e1, %c
%add8.i = add i32 %add.i, %e0
ret i32 %add8.i
}
; Signed dot2 without saturation
; Sign-extends both <2 x i16> operands to <2 x i32>, multiplies lane-wise and
; accumulates with ordinary wrapping adds in the order (e1 + c) + e0.
; Recorded expectations: the gfx906, gfx1011 and gfx950 runs emit a single
; v_dot2_i32_i16 without clamp, while the gfx900 run emits two sign-extending
; SDWA 24-bit multiplies and one v_add3_u32.
define i32 @sdot2_unsat(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX9-DL-LABEL: sdot2_unsat:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot2_i32_i16 v0, v1, v0, v2
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: sdot2_unsat:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_dot2_i32_i16 v0, v1, v0, v2
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: sdot2_unsat:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_dot2_i32_i16 v0, v1, v0, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: sdot2_unsat:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v0, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v2, v3
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; Widen both operands as signed values.
%conv.i = sext <2 x i16> %a to <2 x i32>
%conv6.i = sext <2 x i16> %b to <2 x i32>
; Lane-wise products; note that the extension of %b is the first multiplicand.
%mul.i = mul <2 x i32> %conv6.i, %conv.i
%e0 = extractelement <2 x i32> %mul.i, i64 0
%e1 = extractelement <2 x i32> %mul.i, i64 1
; The accumulator is added to the lane-1 product first, then lane 0 is added.
%add.i = add i32 %e1, %c
%add8.i = add i32 %add.i, %e0
ret i32 %add8.i
}
;------------------------------------------------------------------------------
; DOT2 TESTS WITH I8 PROMOTION (i8 -> i16)
;------------------------------------------------------------------------------
; Unsigned dot2 with i8 inputs promoted to i16, with saturation
; The <2 x i8> operands are zero-extended in two steps (i8 -> i16 -> i32),
; multiplied lane-wise, summed, and accumulated with llvm.uadd.sat.i32.
; Recorded expectations: none of the four runs emits a v_dot2 instruction; each
; emits two BYTE_0 SDWA 24-bit multiplies, a plain add and a clamped add (the
; gfx1011 run spells the adds v_add_nc_u32).
define i32 @udot2_i8_promoted_sat(<2 x i8> %a, <2 x i8> %b, i32 %c) {
; GFX9-DL-LABEL: udot2_i8_promoted_sat:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-DL-NEXT: v_add_u32_e64 v0, v0, v4 clamp
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: udot2_i8_promoted_sat:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX10-DL-NEXT: v_add_nc_u32_e64 v0, v0, v4 clamp
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: udot2_i8_promoted_sat:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX950-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX950-NEXT: v_add_u32_e32 v0, v0, v1
; GFX950-NEXT: v_add_u32_e64 v0, v0, v4 clamp
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: udot2_i8_promoted_sat:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NODL-NEXT: v_add_u32_e64 v0, v0, v4 clamp
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; First promotion step: i8 -> i16 (unsigned).
%a.ext16 = zext <2 x i8> %a to <2 x i16>
%b.ext16 = zext <2 x i8> %b to <2 x i16>
; Second promotion step: i16 -> i32 (unsigned).
%a.ext32 = zext <2 x i16> %a.ext16 to <2 x i32>
%b.ext32 = zext <2 x i16> %b.ext16 to <2 x i32>
; Lane-wise products, then a wrapping sum of the two lanes.
%mul = mul <2 x i32> %a.ext32, %b.ext32
%e0 = extractelement <2 x i32> %mul, i64 0
%e1 = extractelement <2 x i32> %mul, i64 1
%sum = add i32 %e0, %e1
; Only the accumulate with %c saturates.
%result = tail call i32 @llvm.uadd.sat.i32(i32 %sum, i32 %c)
ret i32 %result
}
; Signed dot2 with i8 inputs promoted to i16, with saturation
; The <2 x i8> operands are sign-extended in two steps (i8 -> i16 -> i32),
; multiplied lane-wise, summed, and accumulated with llvm.sadd.sat.i32.
; Recorded expectations: none of the four runs emits a v_dot2 instruction; each
; emits two sign-extending BYTE_0 SDWA 24-bit multiplies, a plain add and a
; clamped signed add (the gfx1011 run spells the adds with the _nc_ infix).
define i32 @sdot2_i8_promoted_sat(<2 x i8> %a, <2 x i8> %b, i32 %c) {
; GFX9-DL-LABEL: sdot2_i8_promoted_sat:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-DL-NEXT: v_add_i32 v0, v0, v4 clamp
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: sdot2_i8_promoted_sat:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX10-DL-NEXT: v_add_nc_i32 v0, v0, v4 clamp
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: sdot2_i8_promoted_sat:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX950-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX950-NEXT: v_add_u32_e32 v0, v0, v1
; GFX950-NEXT: v_add_i32 v0, v0, v4 clamp
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: sdot2_i8_promoted_sat:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NODL-NEXT: v_add_i32 v0, v0, v4 clamp
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; First promotion step: i8 -> i16 (signed).
%a.ext16 = sext <2 x i8> %a to <2 x i16>
%b.ext16 = sext <2 x i8> %b to <2 x i16>
; Second promotion step: i16 -> i32 (signed).
%a.ext32 = sext <2 x i16> %a.ext16 to <2 x i32>
%b.ext32 = sext <2 x i16> %b.ext16 to <2 x i32>
; Lane-wise products, then a wrapping sum of the two lanes.
%mul = mul <2 x i32> %a.ext32, %b.ext32
%e0 = extractelement <2 x i32> %mul, i64 0
%e1 = extractelement <2 x i32> %mul, i64 1
%sum = add i32 %e0, %e1
; Only the accumulate with %c saturates.
%result = tail call i32 @llvm.sadd.sat.i32(i32 %sum, i32 %c)
ret i32 %result
}
; Unsigned dot2 with i8 inputs promoted to i16, without saturation
; The <2 x i8> operands are zero-extended in two steps (i8 -> i16 -> i32),
; multiplied lane-wise, summed, and %c is added with an ordinary wrapping add.
; Recorded expectations: none of the four runs emits a v_dot2 instruction; each
; emits two BYTE_0 SDWA 24-bit multiplies followed by one v_add3_u32.
define i32 @udot2_i8_promoted_unsat(<2 x i8> %a, <2 x i8> %b, i32 %c) {
; GFX9-DL-LABEL: udot2_i8_promoted_unsat:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: udot2_i8_promoted_unsat:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: udot2_i8_promoted_unsat:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX950-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX950-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: udot2_i8_promoted_unsat:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; First promotion step: i8 -> i16 (unsigned).
%a.ext16 = zext <2 x i8> %a to <2 x i16>
%b.ext16 = zext <2 x i8> %b to <2 x i16>
; Second promotion step: i16 -> i32 (unsigned).
%a.ext32 = zext <2 x i16> %a.ext16 to <2 x i32>
%b.ext32 = zext <2 x i16> %b.ext16 to <2 x i32>
; Lane-wise products, then a wrapping sum of the two lanes.
%mul = mul <2 x i32> %a.ext32, %b.ext32
%e0 = extractelement <2 x i32> %mul, i64 0
%e1 = extractelement <2 x i32> %mul, i64 1
%sum = add i32 %e0, %e1
; Plain (non-saturating) accumulate.
%result = add i32 %sum, %c
ret i32 %result
}
; Signed dot2 with i8 inputs promoted to i16, without saturation
; The <2 x i8> operands are sign-extended in two steps (i8 -> i16 -> i32),
; multiplied lane-wise, summed, and %c is added with an ordinary wrapping add.
; Recorded expectations: none of the four runs emits a v_dot2 instruction; each
; emits two sign-extending BYTE_0 SDWA 24-bit multiplies and one v_add3_u32.
define i32 @sdot2_i8_promoted_unsat(<2 x i8> %a, <2 x i8> %b, i32 %c) {
; GFX9-DL-LABEL: sdot2_i8_promoted_unsat:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: sdot2_i8_promoted_unsat:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: sdot2_i8_promoted_unsat:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX950-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX950-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: sdot2_i8_promoted_unsat:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; First promotion step: i8 -> i16 (signed).
%a.ext16 = sext <2 x i8> %a to <2 x i16>
%b.ext16 = sext <2 x i8> %b to <2 x i16>
; Second promotion step: i16 -> i32 (signed).
%a.ext32 = sext <2 x i16> %a.ext16 to <2 x i32>
%b.ext32 = sext <2 x i16> %b.ext16 to <2 x i32>
; Lane-wise products, then a wrapping sum of the two lanes.
%mul = mul <2 x i32> %a.ext32, %b.ext32
%e0 = extractelement <2 x i32> %mul, i64 0
%e1 = extractelement <2 x i32> %mul, i64 1
%sum = add i32 %e0, %e1
; Plain (non-saturating) accumulate.
%result = add i32 %sum, %c
ret i32 %result
}
;------------------------------------------------------------------------------
; NEGATIVE TESTS
;------------------------------------------------------------------------------
; Negative test: the operands are <2 x i8> rather than <2 x i16>, so no dot2
; instruction is expected.
; The inputs are zero-extended directly to <2 x i32>, multiplied lane-wise,
; reduced with llvm.vector.reduce.add.v2i32, and %c is added with a wrapping add.
; Recorded expectations: all four runs emit two BYTE_0 SDWA 24-bit multiplies and
; one v_add3_u32.
; NOTE(review): this note previously read "wrong vector size (not <4 x i8>)",
; which looks carried over from a dot4 test -- confirm the intended wording.
define i32 @dot_wrong_vec_size(<2 x i8> %a, <2 x i8> %b, i32 %c) {
; GFX9-DL-LABEL: dot_wrong_vec_size:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: dot_wrong_vec_size:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: dot_wrong_vec_size:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX950-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX950-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: dot_wrong_vec_size:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v1, v4
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; Single-step widening from i8 straight to i32 (unsigned).
%a_ext = zext <2 x i8> %a to <2 x i32>
%b_ext = zext <2 x i8> %b to <2 x i32>
%mul = mul <2 x i32> %a_ext, %b_ext
; Horizontal add of the two products via the reduction intrinsic.
%sum = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %mul)
; Plain (non-saturating) accumulate.
%result = add i32 %sum, %c
ret i32 %result
}
;------------------------------------------------------------------------------
; SCALAR PACKED I16 SATURATING TESTS (exercise TableGen UDot2SatPat/SDot2SatPat)
;------------------------------------------------------------------------------
; Scalar packed i16 unsigned dot2 with saturation
; Each i32 argument is treated as two packed unsigned 16-bit halves that are
; split with lshr/and, multiplied pairwise, summed, and accumulated with
; llvm.uadd.sat.i32.
; This is meant to exercise the UDot2SatPat TableGen pattern, described as matching:
; uaddsat(add(mul_u24((srl $src0, 16), (srl $src1, 16)),
;             mul_u24((and $src0, 0xFFFF), ...)), ...)
; Recorded expectations: the gfx906, gfx1011 and gfx950 runs emit a single
; v_dot2_u32_u16 carrying the clamp modifier, while the gfx900 run emits two
; SDWA 24-bit multiplies, a plain add and a clamped add.
define i32 @scalar_udot2_sat(i32 %a, i32 %b, i32 %c) {
; GFX9-DL-LABEL: scalar_udot2_sat:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: scalar_udot2_sat:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: scalar_udot2_sat:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 clamp
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: scalar_udot2_sat:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_add_u32_e32 v0, v3, v0
; GFX9-NODL-NEXT: v_add_u32_e64 v0, v0, v2 clamp
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; Extract the high 16 bits (logical shift, so the result is zero-extended)
%a_hi = lshr i32 %a, 16
%b_hi = lshr i32 %b, 16
; Extract the low 16 bits by masking with 0xFFFF
%a_lo = and i32 %a, 65535
%b_lo = and i32 %b, 65535
; Multiply matching halves
%mul_hi = mul i32 %a_hi, %b_hi
%mul_lo = mul i32 %a_lo, %b_lo
; Add the two products (wrapping add)
%sum = add i32 %mul_hi, %mul_lo
; Unsigned saturating add with the accumulator
%result = call i32 @llvm.uadd.sat.i32(i32 %sum, i32 %c)
ret i32 %result
}
; Scalar packed i16 signed dot2 with saturation
; Each i32 argument is treated as two packed signed 16-bit halves that are split
; with ashr and shl+ashr, multiplied pairwise, summed, and accumulated with
; llvm.sadd.sat.i32.
; This is meant to exercise the SDot2SatPat TableGen pattern, described as matching:
; saddsat(add(mul_i24((sra $src0, 16), (sra $src1, 16)),
;             mul_i24((sext_inreg $src0, i16), ...)), ...)
; Recorded expectations: the gfx906, gfx1011 and gfx950 runs emit a single
; v_dot2_i32_i16 carrying the clamp modifier, while the gfx900 run emits two
; sign-extending SDWA 24-bit multiplies, a plain add and a clamped v_add_i32.
define i32 @scalar_sdot2_sat(i32 %a, i32 %b, i32 %c) {
; GFX9-DL-LABEL: scalar_sdot2_sat:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX9-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DL-LABEL: scalar_sdot2_sat:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DL-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX10-DL-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: scalar_sdot2_sat:
; GFX950: ; %bb.0: ; %entry
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NODL-LABEL: scalar_sdot2_sat:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_add_u32_e32 v0, v3, v0
; GFX9-NODL-NEXT: v_add_i32 v0, v0, v2 clamp
; GFX9-NODL-NEXT: s_setpc_b64 s[30:31]
entry:
; Extract the high 16 bits (arithmetic shift, so the sign is preserved)
%a_hi = ashr i32 %a, 16
%b_hi = ashr i32 %b, 16
; Extract the low 16 bits, sign-extended in place via shl followed by ashr
%a_lo_shl = shl i32 %a, 16
%a_lo = ashr i32 %a_lo_shl, 16
%b_lo_shl = shl i32 %b, 16
%b_lo = ashr i32 %b_lo_shl, 16
; Multiply matching halves
%mul_hi = mul i32 %a_hi, %b_hi
%mul_lo = mul i32 %a_lo, %b_lo
; Add the two products (wrapping add)
%sum = add i32 %mul_hi, %mul_lo
; Signed saturating add with the accumulator
%result = call i32 @llvm.sadd.sat.i32(i32 %sum, i32 %c)
ret i32 %result
}
; Declarations of the intrinsics called by the test functions in this file:
; signed/unsigned saturating i32 add and the <2 x i32> add reduction.
declare i32 @llvm.sadd.sat.i32(i32, i32)
declare i32 @llvm.uadd.sat.i32(i32, i32)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)