blob: b443e654350c5eae09b84dfb9803d85616dfdab2 [file] [log] [blame]
Farhana Aleen3528c802018-08-21 16:21:15 +00001; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
Matt Arsenault5a3299a2024-11-26 12:59:15 -05002; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck --check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GFX10-DL %s
Farhana Aleen3528c802018-08-21 16:21:15 +00008
9; add(mul(S0.x, S1.x),
10; add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
11
; @udot2: unsigned two-lane i16 dot product with accumulate.
; Each work-item loads one <2 x i16> from %src1 and one from %src2 (indexed by
; workitem.id.x), zero-extends both lanes to i32, multiplies lane-wise, and adds
; both products to the i32 loaded from %dst, storing the sum back to %dst.
; The add order is (%mul2 + %s3) + %mul1. In the assertions below, the gfx906,
; gfx1011 and gfx1012 runs select v_dot2_u32_u16; the other runs use mad/mul+add.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +010012define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +000013; GFX7-LABEL: udot2:
14; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -050015; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
16; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
17; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +000018; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -050019; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +000020; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050021; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +000022; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
23; GFX7-NEXT: v_mov_b32_e32 v1, 0
24; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -050025; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
Jay Foadfdaa2d02021-02-19 15:04:03 +000026; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -050027; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
28; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +000029; GFX7-NEXT: s_waitcnt vmcnt(1)
30; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
Jay Foade2926502022-05-16 15:53:03 +010031; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +000032; GFX7-NEXT: s_waitcnt vmcnt(0)
33; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
Jay Foade2926502022-05-16 15:53:03 +010034; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +000035; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050036; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +000037; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
Shilei Tian6548b632024-11-08 20:21:16 -050038; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +000039; GFX7-NEXT: s_endpgm
40;
41; GFX8-LABEL: udot2:
42; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -050043; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
44; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +000045; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +000046; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050047; GFX8-NEXT: v_mov_b32_e32 v1, s1
48; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +000049; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
50; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -050051; GFX8-NEXT: v_mov_b32_e32 v1, s3
52; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +000053; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
54; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -050055; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +000056; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +010057; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
Jay Foadfdaa2d02021-02-19 15:04:03 +000058; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
59; GFX8-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +010060; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +000061; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +000062; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050063; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +000064; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -050065; GFX8-NEXT: v_mov_b32_e32 v0, s4
66; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +000067; GFX8-NEXT: flat_store_dword v[0:1], v2
68; GFX8-NEXT: s_endpgm
69;
70; GFX9-NODL-LABEL: udot2:
71; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -050072; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
73; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +000074; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
75; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050076; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
77; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
78; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -050079; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +000080; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
81; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
82; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +000083; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050084; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
85; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +000086; GFX9-NODL-NEXT: s_endpgm
87;
88; GFX9-DL-LABEL: udot2:
89; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -050090; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
91; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +000092; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +000093; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050094; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
95; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
96; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Austin Kerbowda067ed2021-11-10 09:59:31 -080097; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +000098; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050099; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
100; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000101; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000102;
103; GFX10-DL-LABEL: udot2:
104; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +0400105; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500106; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
107; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000108; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000109; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +0000110; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500111; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
112; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
113; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
114; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +0400115; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000116; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500117; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
118; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000119; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100120 ptr addrspace(1) %src2,
121 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +0000122entry:
; Index both source buffers by the work-item id and load one <2 x i16> from each.
Jay Foadfdaa2d02021-02-19 15:04:03 +0000123 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100124 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
125 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
126 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
127 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +0000128
; Lane 0 of both vectors: zero-extend to i32 and multiply.
129 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
130 %conv = zext i16 %s1.elt1 to i32
131 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
132 %conv2 = zext i16 %s2.elt1 to i32
133 %mul1 = mul nuw i32 %conv2, %conv
134
; Lane 1 of both vectors: zero-extend to i32 and multiply.
135 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
136 %conv3 = zext i16 %s1.elt2 to i32
137 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
138 %conv4 = zext i16 %s2.elt2 to i32
139 %mul2 = mul nuw i32 %conv4, %conv3
140
; Accumulate into the i32 at %dst: the accumulator joins the lane-1 product first,
; then the lane-0 product is added, i.e. (%mul2 + %s3) + %mul1.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100141 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000142 %add = add i32 %mul2, %s3
143 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100144 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000145 ret void
146}
147
148; TODO: Support this pattern
149; add(S3,
150; add (mul (S0.x, S1.x), mul (S0.y, S1.y))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
; @udot2_MulMul: same loads, zero-extensions and lane-wise multiplies as an
; unsigned two-lane i16 dot product, but the two products are summed first and
; the i32 loaded from %dst is added last: (%mul2 + %mul1) + %s3.
; None of the assertion blocks below contain a v_dot2 instruction; every run
; shows separate multiply and add instructions for this add order.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100151define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000152; GFX7-LABEL: udot2_MulMul:
153; GFX7: ; %bb.0: ; %entry
Matt Arsenaultd21fc582025-02-07 12:31:14 +0700154; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
155; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
156; GFX7-NEXT: s_mov_b32 s7, 0xf000
157; GFX7-NEXT: s_mov_b32 s10, 0
158; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +0000159; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +0700160; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000161; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
162; GFX7-NEXT: v_mov_b32_e32 v1, 0
Matt Arsenaultd21fc582025-02-07 12:31:14 +0700163; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
164; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
165; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
166; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
167; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +0000168; GFX7-NEXT: s_waitcnt vmcnt(1)
169; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
Jay Foade2926502022-05-16 15:53:03 +0100170; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000171; GFX7-NEXT: s_waitcnt vmcnt(0)
172; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
Jay Foade2926502022-05-16 15:53:03 +0100173; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000174; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2
175; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000176; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +0700177; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
178; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000179; GFX7-NEXT: s_endpgm
180;
181; GFX8-LABEL: udot2_MulMul:
182; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500183; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
184; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000185; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000186; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500187; GFX8-NEXT: v_mov_b32_e32 v1, s1
188; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000189; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
190; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500191; GFX8-NEXT: v_mov_b32_e32 v1, s3
192; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000193; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
194; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500195; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000196; GFX8-NEXT: s_waitcnt vmcnt(1)
197; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
198; GFX8-NEXT: s_waitcnt vmcnt(0)
199; GFX8-NEXT: v_mul_u32_u24_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
200; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
201; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000202; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500203; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
204; GFX8-NEXT: v_mov_b32_e32 v0, s4
205; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000206; GFX8-NEXT: flat_store_dword v[0:1], v2
207; GFX8-NEXT: s_endpgm
208;
209; GFX9-NODL-LABEL: udot2_MulMul:
210; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500211; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
212; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000213; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
214; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500215; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
216; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
217; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -0500218; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000219; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
220; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
221; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000222; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500223; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0
224; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000225; GFX9-NODL-NEXT: s_endpgm
226;
227; GFX9-DL-LABEL: udot2_MulMul:
228; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500229; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
230; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000231; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
232; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500233; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
234; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
235; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -0500236; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000237; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
238; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
239; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000240; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500241; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0
242; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000243; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000244;
245; GFX10-DL-LABEL: udot2_MulMul:
246; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +0400247; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500248; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
249; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000250; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000251; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +0000252; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500253; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
254; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
255; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
256; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000257; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
258; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
259; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
260; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
Jay Foad0412f512019-12-17 16:09:02 +0000261; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500262; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s0
263; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000264; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100265 ptr addrspace(1) %src2,
266 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +0000267entry:
; Index both source buffers by the work-item id and load one <2 x i16> from each.
Jay Foadfdaa2d02021-02-19 15:04:03 +0000268 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100269 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
270 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
271 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
272 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +0000273
; Lane 0 of both vectors: zero-extend to i32 and multiply.
274 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
275 %conv = zext i16 %s1.elt1 to i32
276 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
277 %conv2 = zext i16 %s2.elt1 to i32
278 %mul1 = mul nuw i32 %conv2, %conv
279
; Lane 1 of both vectors: zero-extend to i32 and multiply.
280 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
281 %conv3 = zext i16 %s1.elt2 to i32
282 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
283 %conv4 = zext i16 %s2.elt2 to i32
284 %mul2 = mul nuw i32 %conv4, %conv3
; Sum the two products first, then add the accumulator loaded from %dst.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100285 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000286 %add = add i32 %mul2, %mul1
287 %add6 = add i32 %add, %s3
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100288 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000289 ret void
290}
291
; @idot2: signed two-lane i16 dot product with accumulate.
; Each work-item loads one <2 x i16> from %src1 and one from %src2 (indexed by
; workitem.id.x), sign-extends both lanes to i32, multiplies lane-wise, and adds
; both products to the i32 loaded from %dst, storing the sum back to %dst.
; The add order is (%mul2 + %s3) + %mul1. In the assertions below, the gfx906,
; gfx1011 and gfx1012 runs select v_dot2_i32_i16; the other runs use mad/mul+add.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100292define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000293; GFX7-LABEL: idot2:
294; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500295; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
296; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
297; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +0000298; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -0500299; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +0000300; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500301; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000302; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
303; GFX7-NEXT: v_mov_b32_e32 v1, 0
304; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -0500305; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000306; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -0500307; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
308; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +0000309; GFX7-NEXT: s_waitcnt vmcnt(1)
310; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
311; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
312; GFX7-NEXT: s_waitcnt vmcnt(0)
313; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
314; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000315; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500316; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000317; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -0500318; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000319; GFX7-NEXT: s_endpgm
320;
321; GFX8-LABEL: idot2:
322; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500323; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
324; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000325; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000326; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500327; GFX8-NEXT: v_mov_b32_e32 v1, s1
328; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000329; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
330; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500331; GFX8-NEXT: v_mov_b32_e32 v1, s3
332; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000333; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
334; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500335; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000336; GFX8-NEXT: s_waitcnt vmcnt(1)
337; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
338; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
339; GFX8-NEXT: s_waitcnt vmcnt(0)
340; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
341; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
Jay Foad0412f512019-12-17 16:09:02 +0000342; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500343; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000344; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -0500345; GFX8-NEXT: v_mov_b32_e32 v0, s4
346; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000347; GFX8-NEXT: flat_store_dword v[0:1], v2
348; GFX8-NEXT: s_endpgm
349;
350; GFX9-NODL-LABEL: idot2:
351; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500352; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
353; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000354; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
355; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500356; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
357; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
358; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -0500359; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000360; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
361; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
362; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000363; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500364; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
365; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000366; GFX9-NODL-NEXT: s_endpgm
367;
368; GFX9-DL-LABEL: idot2:
369; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500370; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
371; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000372; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000373; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500374; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
375; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
376; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Austin Kerbowda067ed2021-11-10 09:59:31 -0800377; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000378; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500379; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
380; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000381; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000382;
383; GFX10-DL-LABEL: idot2:
384; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +0400385; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500386; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
387; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000388; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000389; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +0000390; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500391; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
392; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
393; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
394; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +0400395; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000396; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500397; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
398; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000399; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100400 ptr addrspace(1) %src2,
401 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +0000402entry:
; Index both source buffers by the work-item id and load one <2 x i16> from each.
Jay Foadfdaa2d02021-02-19 15:04:03 +0000403 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100404 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
405 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
406 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
407 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +0000408
; Lane 0 of both vectors: sign-extend to i32 and multiply.
; NOTE(review): the multiply carries nuw although its operands are sign-extended;
; a negative-by-positive product would wrap unsigned and be poison. Presumably
; kept deliberately for this pattern test - confirm before changing, since the
; autogenerated assertions would need regenerating.
409 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
410 %conv = sext i16 %s1.elt1 to i32
411 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
412 %conv2 = sext i16 %s2.elt1 to i32
413 %mul1 = mul nuw i32 %conv2, %conv
414
; Lane 1 of both vectors: sign-extend to i32 and multiply (same nuw note applies).
415 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
416 %conv3 = sext i16 %s1.elt2 to i32
417 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
418 %conv4 = sext i16 %s2.elt2 to i32
419 %mul2 = mul nuw i32 %conv4, %conv3
420
; Accumulate into the i32 at %dst: the accumulator joins the lane-1 product first,
; then the lane-0 product is added, i.e. (%mul2 + %s3) + %mul1.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100421 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000422 %add = add i32 %mul2, %s3
423 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100424 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000425 ret void
426}
427
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100428define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000429; GFX7-LABEL: idot2_MixedTypedMul:
430; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500431; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
432; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
433; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +0000434; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -0500435; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +0000436; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500437; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000438; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
439; GFX7-NEXT: v_mov_b32_e32 v1, 0
440; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -0500441; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000442; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -0500443; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
444; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +0000445; GFX7-NEXT: s_waitcnt vmcnt(1)
446; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
447; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
448; GFX7-NEXT: s_waitcnt vmcnt(0)
449; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
450; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000451; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500452; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000453; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v1
Shilei Tian6548b632024-11-08 20:21:16 -0500454; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000455; GFX7-NEXT: s_endpgm
456;
457; GFX8-LABEL: idot2_MixedTypedMul:
458; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500459; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
460; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000461; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000462; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500463; GFX8-NEXT: v_mov_b32_e32 v1, s1
464; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000465; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
466; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500467; GFX8-NEXT: v_mov_b32_e32 v1, s3
468; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000469; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
470; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500471; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000472; GFX8-NEXT: s_waitcnt vmcnt(1)
473; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
474; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
475; GFX8-NEXT: s_waitcnt vmcnt(0)
476; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
477; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Jay Foad0412f512019-12-17 16:09:02 +0000478; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500479; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000480; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -0500481; GFX8-NEXT: v_mov_b32_e32 v0, s4
482; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000483; GFX8-NEXT: flat_store_dword v[0:1], v2
484; GFX8-NEXT: s_endpgm
485;
486; GFX9-NODL-LABEL: idot2_MixedTypedMul:
487; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500488; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
489; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000490; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
491; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500492; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
493; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
494; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -0500495; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000496; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
497; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
498; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000499; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500500; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
501; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000502; GFX9-NODL-NEXT: s_endpgm
503;
504; GFX9-DL-LABEL: idot2_MixedTypedMul:
505; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500506; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
507; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000508; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
509; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500510; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
511; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
512; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -0500513; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000514; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
515; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
516; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000517; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500518; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
519; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000520; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000521;
522; GFX10-DL-LABEL: idot2_MixedTypedMul:
523; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +0400524; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500525; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
526; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000527; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000528; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +0000529; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500530; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
531; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
532; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
533; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000534; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
535; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
536; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
537; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000538; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500539; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
540; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000541; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100542 ptr addrspace(1) %src2,
543 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +0000544entry:
Jay Foadfdaa2d02021-02-19 15:04:03 +0000545 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100546 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
547 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
548 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
549 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +0000550
551 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
552 %conv = sext i16 %s1.elt1 to i32
553 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
554 %conv2 = sext i16 %s2.elt1 to i32
555 %mul1 = mul nuw i32 %conv2, %conv
556
557 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
558 %conv3 = zext i16 %s1.elt2 to i32
559 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
560 %conv4 = zext i16 %s2.elt2 to i32
561 %mul2 = mul nuw i32 %conv4, %conv3
562
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100563 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000564 %add = add i32 %mul2, %s3
565 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100566 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000567 ret void
568}
569
; @udot2_alt_AddOperands: unsigned 2 x i16 dot product accumulated into the
; i32 that is loaded from %dst. Both lanes of both input vectors are
; zero-extended, and the two adds are written as (%s3 + %mul2) followed by
; (%mul1 + %add).
; Per the assertions below, the gfx906 and gfx1011/gfx1012 runs expect a single
; v_dot2_u32_u16, while the gfx700, gfx803 and gfx900 runs expect a pair of
; v_mad_u32_u24.
; NOTE(review): "alt" presumably refers to the add operand order relative to
; @udot2 - confirm against that function's IR.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100570define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000571; GFX7-LABEL: udot2_alt_AddOperands:
572; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500573; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
574; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
575; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +0000576; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -0500577; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +0000578; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500579; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000580; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
581; GFX7-NEXT: v_mov_b32_e32 v1, 0
582; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -0500583; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000584; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -0500585; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
586; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +0000587; GFX7-NEXT: s_waitcnt vmcnt(1)
588; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
Jay Foade2926502022-05-16 15:53:03 +0100589; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000590; GFX7-NEXT: s_waitcnt vmcnt(0)
591; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
Jay Foade2926502022-05-16 15:53:03 +0100592; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000593; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500594; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000595; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
Shilei Tian6548b632024-11-08 20:21:16 -0500596; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000597; GFX7-NEXT: s_endpgm
598;
599; GFX8-LABEL: udot2_alt_AddOperands:
600; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500601; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
602; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000603; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000604; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500605; GFX8-NEXT: v_mov_b32_e32 v1, s1
606; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000607; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
608; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500609; GFX8-NEXT: v_mov_b32_e32 v1, s3
610; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000611; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
612; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500613; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000614; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +0100615; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
Jay Foadfdaa2d02021-02-19 15:04:03 +0000616; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
617; GFX8-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +0100618; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000619; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000620; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500621; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000622; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -0500623; GFX8-NEXT: v_mov_b32_e32 v0, s4
624; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000625; GFX8-NEXT: flat_store_dword v[0:1], v2
626; GFX8-NEXT: s_endpgm
627;
628; GFX9-NODL-LABEL: udot2_alt_AddOperands:
629; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500630; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
631; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000632; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000633; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500634; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
635; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
636; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -0500637; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000638; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +0100639; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1
Jay Foadfdaa2d02021-02-19 15:04:03 +0000640; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +0100641; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000642; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
643; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000644; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500645; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000646; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1
Shilei Tian6548b632024-11-08 20:21:16 -0500647; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000648; GFX9-NODL-NEXT: s_endpgm
649;
650; GFX9-DL-LABEL: udot2_alt_AddOperands:
651; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500652; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
653; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000654; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000655; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500656; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
657; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
658; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Austin Kerbowda067ed2021-11-10 09:59:31 -0800659; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000660; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500661; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
662; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000663; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000664;
665; GFX10-DL-LABEL: udot2_alt_AddOperands:
666; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +0400667; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500668; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
669; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000670; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000671; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +0000672; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500673; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
674; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
675; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
676; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +0400677; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000678; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500679; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
680; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000681; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100682 ptr addrspace(1) %src2,
683 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +0000684entry:
  ; Each work-item reads the <2 x i16> at index workitem.id.x from both inputs.
Jay Foadfdaa2d02021-02-19 15:04:03 +0000685 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100686 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
687 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
688 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
689 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +0000690
  ; Lane 0 of each vector, zero-extended to i32 and multiplied.
691 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
692 %conv = zext i16 %s1.elt1 to i32
693 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
694 %conv2 = zext i16 %s2.elt1 to i32
695 %mul1 = mul nuw i32 %conv2, %conv
696
  ; Lane 1 of each vector, zero-extended to i32 and multiplied.
697 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
698 %conv3 = zext i16 %s1.elt2 to i32
699 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
700 %conv4 = zext i16 %s2.elt2 to i32
701 %mul2 = mul nuw i32 %conv4, %conv3
702
  ; Accumulate into the i32 at %dst; the accumulator is the first operand of
  ; the inner add and the inner sum is the second operand of the outer add.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100703 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000704 %add = add i32 %s3, %mul2
705 %add6 = add i32 %mul1, %add
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100706 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000707 ret void
708}
709
; @idot2_MixedExt: 2 x i16 multiply-accumulate in which the extensions of the
; multiplicands disagree. Lane 0 multiplies a sign-extended %vec1 element by a
; zero-extended %vec2 element; lane 1 sign-extends both elements.
; None of the assertions below contain a v_dot2 instruction: the gfx700 and
; gfx803 runs expect two v_mad_i32_i24, and the gfx900, gfx906 and
; gfx1011/gfx1012 runs expect two v_mul_i32_i24_sdwa followed by v_add3_u32.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100710define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000711; GFX7-LABEL: idot2_MixedExt:
712; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500713; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
714; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
715; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +0000716; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -0500717; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +0000718; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500719; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000720; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
721; GFX7-NEXT: v_mov_b32_e32 v1, 0
722; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -0500723; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000724; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -0500725; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
726; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +0000727; GFX7-NEXT: s_waitcnt vmcnt(1)
728; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
729; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
730; GFX7-NEXT: s_waitcnt vmcnt(0)
731; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
732; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000733; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500734; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000735; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -0500736; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000737; GFX7-NEXT: s_endpgm
738;
739; GFX8-LABEL: idot2_MixedExt:
740; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500741; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
742; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000743; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000744; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500745; GFX8-NEXT: v_mov_b32_e32 v1, s1
746; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000747; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
748; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500749; GFX8-NEXT: v_mov_b32_e32 v1, s3
750; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000751; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
752; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500753; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000754; GFX8-NEXT: s_waitcnt vmcnt(1)
755; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
756; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
757; GFX8-NEXT: s_waitcnt vmcnt(0)
758; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
759; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
Jay Foad0412f512019-12-17 16:09:02 +0000760; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500761; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000762; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -0500763; GFX8-NEXT: v_mov_b32_e32 v0, s4
764; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000765; GFX8-NEXT: flat_store_dword v[0:1], v2
766; GFX8-NEXT: s_endpgm
767;
768; GFX9-NODL-LABEL: idot2_MixedExt:
769; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500770; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
771; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000772; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
773; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500774; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
775; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
776; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -0500777; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000778; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
779; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
780; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000781; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500782; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
783; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000784; GFX9-NODL-NEXT: s_endpgm
785;
786; GFX9-DL-LABEL: idot2_MixedExt:
787; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500788; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
789; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000790; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
791; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500792; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
793; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
794; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -0500795; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000796; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
797; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
798; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000799; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500800; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
801; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000802; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000803;
804; GFX10-DL-LABEL: idot2_MixedExt:
805; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +0400806; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500807; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
808; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000809; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000810; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +0000811; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500812; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
813; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
814; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
815; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000816; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
817; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
818; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
819; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000820; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500821; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
822; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000823; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100824 ptr addrspace(1) %src2,
825 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +0000826entry:
  ; Each work-item reads the <2 x i16> at index workitem.id.x from both inputs.
Jay Foadfdaa2d02021-02-19 15:04:03 +0000827 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100828 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
829 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
830 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
831 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +0000832
  ; Lane 0: the %vec1 element is sign-extended but the %vec2 element is
  ; zero-extended, so the two multiplicands use different extensions.
833 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
834 %conv = sext i16 %s1.elt1 to i32
835 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
836 %conv2 = zext i16 %s2.elt1 to i32
837 %mul1 = mul nuw i32 %conv2, %conv
838
  ; Lane 1: both elements are sign-extended.
839 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
840 %conv3 = sext i16 %s1.elt2 to i32
841 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
842 %conv4 = sext i16 %s2.elt2 to i32
843 %mul2 = mul nuw i32 %conv4, %conv3
844
  ; Accumulate both products into the i32 loaded from %dst and store it back.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100845 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000846 %add = add i32 %mul2, %s3
847 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100848 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000849 ret void
850}
851
; @notudot2_SameVec: multiply-accumulate in which each product squares a
; single element: lane 0 of %vec1 times itself and lane 1 of %vec2 times
; itself. The two multiplicands of a product therefore never come from
; different vectors. All elements are zero-extended, and the muls carry no
; nuw flag.
; None of the assertions below contain a v_dot2 instruction: the gfx700 and
; gfx803 runs expect two v_mad_u32_u24 with identical source operands, and the
; gfx900, gfx906 and gfx1011/gfx1012 runs expect two v_mul_u32_u24_sdwa
; followed by v_add3_u32.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100852define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000853; GFX7-LABEL: notudot2_SameVec:
854; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500855; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
856; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
857; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +0000858; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -0500859; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +0000860; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500861; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000862; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
863; GFX7-NEXT: v_mov_b32_e32 v1, 0
864; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -0500865; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
Jay Foadfdaa2d02021-02-19 15:04:03 +0000866; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -0500867; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
868; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +0000869; GFX7-NEXT: s_waitcnt vmcnt(1)
870; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
871; GFX7-NEXT: s_waitcnt vmcnt(0)
872; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000873; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500874; GFX7-NEXT: v_mad_u32_u24 v0, v0, v0, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000875; GFX7-NEXT: v_mad_u32_u24 v0, v1, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -0500876; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000877; GFX7-NEXT: s_endpgm
878;
879; GFX8-LABEL: notudot2_SameVec:
880; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500881; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
882; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000883; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000884; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500885; GFX8-NEXT: v_mov_b32_e32 v1, s1
886; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +0000887; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
Austin Kerbowda067ed2021-11-10 09:59:31 -0800888; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500889; GFX8-NEXT: v_mov_b32_e32 v1, s3
890; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Austin Kerbowda067ed2021-11-10 09:59:31 -0800891; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
Jay Foadfdaa2d02021-02-19 15:04:03 +0000892; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -0500893; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000894; GFX8-NEXT: s_waitcnt vmcnt(1)
Austin Kerbowda067ed2021-11-10 09:59:31 -0800895; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
Jay Foadfdaa2d02021-02-19 15:04:03 +0000896; GFX8-NEXT: s_waitcnt vmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -0800897; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Jay Foad43830792019-10-07 15:33:59 +0100898; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500899; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s0
Austin Kerbowda067ed2021-11-10 09:59:31 -0800900; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -0500901; GFX8-NEXT: v_mov_b32_e32 v0, s4
902; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000903; GFX8-NEXT: flat_store_dword v[0:1], v2
904; GFX8-NEXT: s_endpgm
905;
906; GFX9-NODL-LABEL: notudot2_SameVec:
907; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500908; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
909; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000910; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
911; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500912; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
913; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
914; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -0500915; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000916; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
917; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
918; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
919; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000920; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500921; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s0, v1
922; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000923; GFX9-NODL-NEXT: s_endpgm
924;
925; GFX9-DL-LABEL: notudot2_SameVec:
926; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500927; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
928; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000929; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
930; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500931; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
932; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
933; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -0500934; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000935; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
936; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
937; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
938; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000939; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500940; GFX9-DL-NEXT: v_add3_u32 v1, v2, s0, v1
941; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000942; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000943;
944; GFX10-DL-LABEL: notudot2_SameVec:
945; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +0400946; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500947; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
948; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +0000949; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000950; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +0000951; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -0500952; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
953; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
954; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
955; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +0000956; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
957; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
958; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
959; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
960; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
Jay Foad0412f512019-12-17 16:09:02 +0000961; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500962; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
963; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +0000964; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100965 ptr addrspace(1) %src2,
966 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +0000967entry:
  ; Each work-item reads the <2 x i16> at index workitem.id.x from both inputs.
Jay Foadfdaa2d02021-02-19 15:04:03 +0000968 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100969 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
970 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
971 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
972 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +0000973
  ; First product: both multiplicands are element 0 of %vec1 (note that
  ; %s2.elt1 is extracted from %vec1, not %vec2).
974 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
975 %conv = zext i16 %s1.elt1 to i32
976 %s2.elt1 = extractelement <2 x i16> %vec1, i64 0
977 %conv2 = zext i16 %s2.elt1 to i32
978 %mul1 = mul i32 %conv2, %conv
979
  ; Second product: both multiplicands are element 1 of %vec2 (note that
  ; %s1.elt2 is extracted from %vec2, not %vec1).
980 %s1.elt2 = extractelement <2 x i16> %vec2, i64 1
981 %conv3 = zext i16 %s1.elt2 to i32
982 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
983 %conv4 = zext i16 %s2.elt2 to i32
984 %mul2 = mul i32 %conv4, %conv3
985
  ; Accumulate both products into the i32 loaded from %dst and store it back.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100986 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000987 %add = add i32 %mul2, %s3
988 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100989 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +0000990 ret void
991}
992
; udot2_v4i16: each work item loads one <4 x i16> from %src1 and one from
; %src2 (both indexed by workitem.id.x), multiplies the zero-extended
; elements 0 and 1 pairwise, and adds both products to the i32 read from
; %dst before storing the sum back to %dst.
; The autogenerated checks below expect a single v_dot2_u32_u16 under the
; GFX9-DL and GFX10-DL prefixes, two v_mad_u32_u24 under the GFX7 and GFX8
; prefixes, and v_mul_u32_u24_sdwa plus v_add3_u32 under GFX9-NODL.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +0100993define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000994; GFX7-LABEL: udot2_v4i16:
995; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -0500996; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
997; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
998; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +0000999; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -05001000; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00001001; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001002; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001003; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1004; GFX7-NEXT: v_mov_b32_e32 v1, 0
Shilei Tian6548b632024-11-08 20:21:16 -05001005; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
1006; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001007; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -05001008; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1009; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1010; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001011; GFX7-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01001012; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001013; GFX7-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01001014; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001015; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1016; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001017; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001018; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001019; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05001020; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001021; GFX7-NEXT: s_endpgm
1022;
1023; GFX8-LABEL: udot2_v4i16:
1024; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001025; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1026; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001027; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001028; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001029; GFX8-NEXT: v_mov_b32_e32 v1, s1
1030; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001031; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
Shilei Tian6548b632024-11-08 20:21:16 -05001032; GFX8-NEXT: v_mov_b32_e32 v3, s3
1033; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001034; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1035; GFX8-NEXT: flat_load_dword v0, v[0:1]
1036; GFX8-NEXT: flat_load_dword v1, v[2:3]
Shilei Tian6548b632024-11-08 20:21:16 -05001037; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001038; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01001039; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001040; GFX8-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01001041; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001042; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1043; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001044; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001045; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001046; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
Shilei Tian6548b632024-11-08 20:21:16 -05001047; GFX8-NEXT: v_mov_b32_e32 v0, s4
1048; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001049; GFX8-NEXT: flat_store_dword v[0:1], v2
1050; GFX8-NEXT: s_endpgm
1051;
1052; GFX9-NODL-LABEL: udot2_v4i16:
1053; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001054; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1055; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001056; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1057; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001058; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
1059; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
1060; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05001061; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001062; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1063; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1064; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001065; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001066; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
1067; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001068; GFX9-NODL-NEXT: s_endpgm
1069;
1070; GFX9-DL-LABEL: udot2_v4i16:
1071; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001072; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1073; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001074; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001075; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001076; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
1077; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
1078; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Austin Kerbowda067ed2021-11-10 09:59:31 -08001079; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001080; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001081; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
1082; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001083; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001084;
1085; GFX10-DL-LABEL: udot2_v4i16:
1086; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04001087; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001088; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1089; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001090; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001091; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00001092; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001093; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
1094; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
1095; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1096; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04001097; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001098; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001099; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
1100; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001101; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001102 ptr addrspace(1) %src2,
1103 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00001104entry:
; Both input vectors are addressed by the work-item id.
Jay Foadfdaa2d02021-02-19 15:04:03 +00001105 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001106 %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1107 %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1108 %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1109 %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00001110
; %mul1 = zext(vec2[0]) * zext(vec1[0])
1111 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1112 %conv = zext i16 %s1.elt1 to i32
1113 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1114 %conv2 = zext i16 %s2.elt1 to i32
1115 %mul1 = mul i32 %conv2, %conv
1116
; %mul2 = zext(vec2[1]) * zext(vec1[1])
1117 %s1.elt2 = extractelement <4 x i16> %vec1, i64 1
1118 %conv3 = zext i16 %s1.elt2 to i32
1119 %s2.elt2 = extractelement <4 x i16> %vec2, i64 1
1120 %conv4 = zext i16 %s2.elt2 to i32
1121 %mul2 = mul i32 %conv4, %conv3
1122
; Accumulate both products into the i32 read from %dst and write it back.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001123 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001124 %add = add i32 %mul2, %s3
1125 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001126 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001127 ret void
1128}
1129
; udot2_v4i16_Hi: same shape as a two-element unsigned dot product, but the
; operands are elements 2 and 3 of each <4 x i16> loaded from %src1 and
; %src2 (both indexed by workitem.id.x). The two zero-extended products are
; added to the i32 read from %dst and the sum is stored back to %dst.
; The autogenerated checks below expect dword loads at offset 4 (an explicit
; add of 4 to the address under the GFX8 prefix) and a single v_dot2_u32_u16
; under the GFX9-DL and GFX10-DL prefixes.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001130define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001131; GFX7-LABEL: udot2_v4i16_Hi:
1132; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001133; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1134; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1135; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +00001136; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1137; GFX7-NEXT: v_mov_b32_e32 v1, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001138; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -05001139; GFX7-NEXT: s_mov_b32 s11, s7
Joe Nash3ce1b962021-09-08 13:22:15 -04001140; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001141; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001142; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
Shilei Tian6548b632024-11-08 20:21:16 -05001143; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001144; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
Shilei Tian6548b632024-11-08 20:21:16 -05001145; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1146; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001147; GFX7-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01001148; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001149; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1150; GFX7-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01001151; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001152; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001153; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001154; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001155; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05001156; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001157; GFX7-NEXT: s_endpgm
1158;
1159; GFX8-LABEL: udot2_v4i16_Hi:
1160; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001161; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1162; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001163; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001164; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001165; GFX8-NEXT: v_mov_b32_e32 v1, s1
1166; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001167; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
Shilei Tian6548b632024-11-08 20:21:16 -05001168; GFX8-NEXT: v_mov_b32_e32 v3, s3
1169; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001170; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1171; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
1172; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1173; GFX8-NEXT: flat_load_dword v2, v[0:1]
1174; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
1175; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1176; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05001177; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001178; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01001179; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001180; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1181; GFX8-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01001182; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001183; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001184; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001185; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001186; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05001187; GFX8-NEXT: v_mov_b32_e32 v0, s4
1188; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001189; GFX8-NEXT: flat_store_dword v[0:1], v2
1190; GFX8-NEXT: s_endpgm
1191;
1192; GFX9-NODL-LABEL: udot2_v4i16_Hi:
1193; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001194; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1195; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001196; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1197; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001198; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
1199; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
1200; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05001201; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001202; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1203; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1204; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001205; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001206; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
1207; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001208; GFX9-NODL-NEXT: s_endpgm
1209;
1210; GFX9-DL-LABEL: udot2_v4i16_Hi:
1211; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001212; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1213; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001214; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001215; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001216; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
1217; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
1218; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Austin Kerbowda067ed2021-11-10 09:59:31 -08001219; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001220; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001221; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
1222; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001223; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001224;
1225; GFX10-DL-LABEL: udot2_v4i16_Hi:
1226; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04001227; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001228; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1229; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001230; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001231; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00001232; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001233; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4
1234; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4
1235; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1236; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04001237; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001238; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001239; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
1240; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001241; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001242 ptr addrspace(1) %src2,
1243 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00001244entry:
; Both input vectors are addressed by the work-item id.
Jay Foadfdaa2d02021-02-19 15:04:03 +00001245 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001246 %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1247 %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1248 %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1249 %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00001250
; %mul1 = zext(vec2[2]) * zext(vec1[2])
1251 %s1.elt1 = extractelement <4 x i16> %vec1, i64 2
1252 %conv = zext i16 %s1.elt1 to i32
1253 %s2.elt1 = extractelement <4 x i16> %vec2, i64 2
1254 %conv2 = zext i16 %s2.elt1 to i32
1255 %mul1 = mul i32 %conv2, %conv
1256
; %mul2 = zext(vec2[3]) * zext(vec1[3])
1257 %s1.elt2 = extractelement <4 x i16> %vec1, i64 3
1258 %conv3 = zext i16 %s1.elt2 to i32
1259 %s2.elt2 = extractelement <4 x i16> %vec2, i64 3
1260 %conv4 = zext i16 %s2.elt2 to i32
1261 %mul2 = mul i32 %conv4, %conv3
1262
; Accumulate both products into the i32 read from %dst and write it back.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001263 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001264 %add = add i32 %mul2, %s3
1265 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001266 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001267 ret void
1268}
1269
; notudot2_v4i16_Even: each work item loads one <4 x i16> from %src1 and one
; from %src2 (both indexed by workitem.id.x) and multiplies the zero-extended
; elements 0 and 2 pairwise, i.e. two elements that are not adjacent in the
; vector. Both products are added to the i32 read from %dst and the sum is
; stored back to %dst.
; None of the autogenerated checks below contain v_dot2_u32_u16; even the
; GFX9-DL and GFX10-DL prefixes expect v_mul_u32_u24_sdwa plus v_add3_u32.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001270define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001271; GFX7-LABEL: notudot2_v4i16_Even:
1272; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001273; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1274; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1275; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +00001276; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -05001277; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00001278; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001279; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001280; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1281; GFX7-NEXT: v_mov_b32_e32 v1, 0
Shilei Tian6548b632024-11-08 20:21:16 -05001282; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
1283; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001284; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -05001285; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
1286; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1287; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001288; GFX7-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01001289; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
Austin Kerbowda067ed2021-11-10 09:59:31 -08001290; GFX7-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01001291; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
1292; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
1293; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001294; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001295; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001296; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
Shilei Tian6548b632024-11-08 20:21:16 -05001297; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001298; GFX7-NEXT: s_endpgm
1299;
1300; GFX8-LABEL: notudot2_v4i16_Even:
1301; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001302; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1303; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001304; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001305; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001306; GFX8-NEXT: v_mov_b32_e32 v1, s1
1307; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001308; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
Shilei Tian6548b632024-11-08 20:21:16 -05001309; GFX8-NEXT: v_mov_b32_e32 v3, s3
1310; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001311; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1312; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1313; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
Shilei Tian6548b632024-11-08 20:21:16 -05001314; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001315; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01001316; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001317; GFX8-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01001318; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3
1319; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
1320; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001321; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001322; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001323; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
Shilei Tian6548b632024-11-08 20:21:16 -05001324; GFX8-NEXT: v_mov_b32_e32 v0, s4
1325; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001326; GFX8-NEXT: flat_store_dword v[0:1], v2
1327; GFX8-NEXT: s_endpgm
1328;
1329; GFX9-NODL-LABEL: notudot2_v4i16_Even:
1330; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001331; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1332; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001333; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001334; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001335; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1336; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1337; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001338; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0
1339; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1340; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1341; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001342; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001343; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0
1344; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001345; GFX9-NODL-NEXT: s_endpgm
1346;
1347; GFX9-DL-LABEL: notudot2_v4i16_Even:
1348; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001349; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1350; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001351; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001352; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001353; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1354; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1355; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001356; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0
1357; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1358; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1359; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001360; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001361; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0
1362; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001363; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001364;
1365; GFX10-DL-LABEL: notudot2_v4i16_Even:
1366; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04001367; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001368; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1369; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001370; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001371; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00001372; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001373; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1374; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1375; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1376; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001377; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1378; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1379; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1380; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
Jay Foad0412f512019-12-17 16:09:02 +00001381; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001382; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
1383; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001384; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001385 ptr addrspace(1) %src2,
1386 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00001387entry:
; Both input vectors are addressed by the work-item id.
Jay Foadfdaa2d02021-02-19 15:04:03 +00001388 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001389 %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1390 %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1391 %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1392 %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00001393
; %mul1 = zext(vec2[0]) * zext(vec1[0])
1394 %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1395 %conv = zext i16 %s1.elt1 to i32
1396 %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1397 %conv2 = zext i16 %s2.elt1 to i32
1398 %mul1 = mul i32 %conv2, %conv
1399
; %mul2 = zext(vec2[2]) * zext(vec1[2]); element 2 is not adjacent to element 0.
1400 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1401 %conv3 = zext i16 %s1.elt2 to i32
1402 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1403 %conv4 = zext i16 %s2.elt2 to i32
1404 %mul2 = mul i32 %conv4, %conv3
1405
; Accumulate both products into the i32 read from %dst and write it back.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001406 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001407 %add = add i32 %mul2, %s3
1408 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001409 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001410 ret void
1411}
1412
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001413define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001414; GFX7-LABEL: notudot2_v4i16_Middle:
1415; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001416; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1417; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1418; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +00001419; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -05001420; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00001421; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001422; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001423; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1424; GFX7-NEXT: v_mov_b32_e32 v1, 0
Shilei Tian6548b632024-11-08 20:21:16 -05001425; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
1426; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001427; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -05001428; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
1429; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1430; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001431; GFX7-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01001432; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
Jay Foadfdaa2d02021-02-19 15:04:03 +00001433; GFX7-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01001434; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001435; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1436; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001437; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001438; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001439; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
Shilei Tian6548b632024-11-08 20:21:16 -05001440; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001441; GFX7-NEXT: s_endpgm
1442;
1443; GFX8-LABEL: notudot2_v4i16_Middle:
1444; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001445; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1446; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001447; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001448; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001449; GFX8-NEXT: v_mov_b32_e32 v1, s1
1450; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001451; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
Shilei Tian6548b632024-11-08 20:21:16 -05001452; GFX8-NEXT: v_mov_b32_e32 v3, s3
1453; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001454; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1455; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1456; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
Shilei Tian6548b632024-11-08 20:21:16 -05001457; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001458; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01001459; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001460; GFX8-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01001461; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3
Jay Foadfdaa2d02021-02-19 15:04:03 +00001462; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1463; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001464; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001465; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001466; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
Shilei Tian6548b632024-11-08 20:21:16 -05001467; GFX8-NEXT: v_mov_b32_e32 v0, s4
1468; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001469; GFX8-NEXT: flat_store_dword v[0:1], v2
1470; GFX8-NEXT: s_endpgm
1471;
1472; GFX9-NODL-LABEL: notudot2_v4i16_Middle:
1473; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001474; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1475; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001476; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001477; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001478; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1479; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1480; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001481; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0
1482; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1483; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1484; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001485; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001486; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0
1487; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001488; GFX9-NODL-NEXT: s_endpgm
1489;
1490; GFX9-DL-LABEL: notudot2_v4i16_Middle:
1491; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001492; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1493; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001494; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001495; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001496; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1497; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1498; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001499; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0
1500; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1501; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1502; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001503; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001504; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0
1505; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001506; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001507;
1508; GFX10-DL-LABEL: notudot2_v4i16_Middle:
1509; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04001510; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001511; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1512; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001513; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001514; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00001515; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001516; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
1517; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
1518; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1519; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001520; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1521; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1522; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1523; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
Jay Foad0412f512019-12-17 16:09:02 +00001524; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001525; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
1526; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001527; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001528 ptr addrspace(1) %src2,
1529 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00001530entry:
Jay Foadfdaa2d02021-02-19 15:04:03 +00001531 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001532 %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1533 %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1534 %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1535 %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00001536
1537 %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
1538 %conv = zext i16 %s1.elt1 to i32
1539 %s2.elt1 = extractelement <4 x i16> %vec2, i64 1
1540 %conv2 = zext i16 %s2.elt1 to i32
1541 %mul1 = mul i32 %conv2, %conv
1542
1543 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1544 %conv3 = zext i16 %s1.elt2 to i32
1545 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1546 %conv4 = zext i16 %s2.elt2 to i32
1547 %mul2 = mul i32 %conv4, %conv3
1548
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001549 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001550 %add = add i32 %mul2, %s3
1551 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001552 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001553 ret void
1554}
1555
; notudot2_DiffIndex: each work-item loads one <2 x i16> from %src1 and one
; from %src2, zero-extends the lanes to i32, and accumulates two products
; into the i32 stored at %dst. The products are "crossed": lane 0 of one
; vector is multiplied with lane 1 of the other (see the extractelement
; indices below), which is not the same-lane pairing of a dot product.
; The recorded checks for every prefix use 24-bit mul/mad instructions plus
; an add, and none of them contains a v_dot2 instruction.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001556define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001557; GFX7-LABEL: notudot2_DiffIndex:
1558; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001559; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1560; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1561; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +00001562; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -05001563; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00001564; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001565; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001566; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1567; GFX7-NEXT: v_mov_b32_e32 v1, 0
1568; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -05001569; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001570; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -05001571; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1572; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001573; GFX7-NEXT: s_waitcnt vmcnt(1)
1574; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
Jay Foade2926502022-05-16 15:53:03 +01001575; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001576; GFX7-NEXT: s_waitcnt vmcnt(0)
1577; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
Jay Foade2926502022-05-16 15:53:03 +01001578; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001579; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001580; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001581; GFX7-NEXT: v_mad_u32_u24 v0, v3, v2, v0
Shilei Tian6548b632024-11-08 20:21:16 -05001582; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001583; GFX7-NEXT: s_endpgm
1584;
1585; GFX8-LABEL: notudot2_DiffIndex:
1586; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001587; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1588; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001589; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001590; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001591; GFX8-NEXT: v_mov_b32_e32 v1, s1
1592; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001593; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1594; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05001595; GFX8-NEXT: v_mov_b32_e32 v1, s3
1596; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001597; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1598; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05001599; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001600; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01001601; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
Jay Foadfdaa2d02021-02-19 15:04:03 +00001602; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1603; GFX8-NEXT: s_waitcnt vmcnt(0)
1604; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
Jay Foade2926502022-05-16 15:53:03 +01001605; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001606; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001607; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001608; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05001609; GFX8-NEXT: v_mov_b32_e32 v0, s4
1610; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001611; GFX8-NEXT: flat_store_dword v[0:1], v2
1612; GFX8-NEXT: s_endpgm
1613;
1614; GFX9-NODL-LABEL: notudot2_DiffIndex:
1615; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001616; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1617; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001618; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1619; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001620; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
1621; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
1622; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05001623; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001624; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1625; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1626; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001627; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001628; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
1629; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001630; GFX9-NODL-NEXT: s_endpgm
1631;
1632; GFX9-DL-LABEL: notudot2_DiffIndex:
1633; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001634; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1635; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001636; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1637; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001638; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
1639; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
1640; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05001641; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001642; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1643; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1644; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001645; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001646; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3
1647; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001648; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001649;
1650; GFX10-DL-LABEL: notudot2_DiffIndex:
1651; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04001652; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001653; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1654; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001655; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001656; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00001657; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001658; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
1659; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
1660; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1661; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001662; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1663; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1664; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1665; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001666; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001667; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
1668; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001669; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001670 ptr addrspace(1) %src2,
1671 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00001672entry:
Jay Foadfdaa2d02021-02-19 15:04:03 +00001673 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001674 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
1675 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
1676 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
1677 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00001678
; First product: zext(%vec2[1]) * zext(%vec1[0]) -- lanes are crossed.
1679 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1680 %conv = zext i16 %s1.elt1 to i32
1681 %s2.elt1 = extractelement <2 x i16> %vec2, i64 1
1682 %conv2 = zext i16 %s2.elt1 to i32
1683 %mul1 = mul i32 %conv2, %conv
1684
; Second product: zext(%vec2[0]) * zext(%vec1[1]) -- the opposite crossing.
1685 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1686 %conv3 = zext i16 %s1.elt2 to i32
1687 %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
1688 %conv4 = zext i16 %s2.elt2 to i32
1689 %mul2 = mul i32 %conv4, %conv3
1690
; Add both products to the i32 currently stored at %dst and write it back.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001691 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001692 %add = add i32 %mul2, %s3
1693 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001694 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001695 ret void
1696}
1697
; udot2_MultipleUses_add1: each work-item loads one <2 x i16> from %src1 and
; one from %src2 and forms the two same-lane, zero-extended products
; (lane 0 * lane 0 and lane 1 * lane 1). The intermediate sum
; %add1 = %mul2 + %s3 is used twice: once in the accumulation chain (%add2)
; and once more in the final %res = %add2 + %add1.
; The recorded checks for every prefix, GFX9-DL and GFX10-DL included, keep
; separate mul/mad instructions followed by an add, and none of them
; contains a v_dot2 instruction.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001698define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001699; GFX7-LABEL: udot2_MultipleUses_add1:
1700; GFX7: ; %bb.0: ; %entry
Matt Arsenaultd21fc582025-02-07 12:31:14 +07001701; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1702; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1703; GFX7-NEXT: s_mov_b32 s7, 0xf000
1704; GFX7-NEXT: s_mov_b32 s10, 0
1705; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00001706; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07001707; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001708; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1709; GFX7-NEXT: v_mov_b32_e32 v1, 0
Matt Arsenaultd21fc582025-02-07 12:31:14 +07001710; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1711; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
1712; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1713; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1714; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001715; GFX7-NEXT: s_waitcnt vmcnt(1)
1716; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
Jay Foade2926502022-05-16 15:53:03 +01001717; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001718; GFX7-NEXT: s_waitcnt vmcnt(0)
1719; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
Jay Foade2926502022-05-16 15:53:03 +01001720; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001721; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07001722; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001723; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001724; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
Matt Arsenaultd21fc582025-02-07 12:31:14 +07001725; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001726; GFX7-NEXT: s_endpgm
1727;
1728; GFX8-LABEL: udot2_MultipleUses_add1:
1729; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001730; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1731; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001732; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001733; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001734; GFX8-NEXT: v_mov_b32_e32 v1, s1
1735; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001736; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1737; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05001738; GFX8-NEXT: v_mov_b32_e32 v1, s3
1739; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001740; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1741; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05001742; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001743; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01001744; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
Jay Foadfdaa2d02021-02-19 15:04:03 +00001745; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1746; GFX8-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01001747; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001748; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001749; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001750; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001751; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0
1752; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05001753; GFX8-NEXT: v_mov_b32_e32 v0, s4
1754; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001755; GFX8-NEXT: flat_store_dword v[0:1], v2
1756; GFX8-NEXT: s_endpgm
1757;
1758; GFX9-NODL-LABEL: udot2_MultipleUses_add1:
1759; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001760; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1761; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001762; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1763; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001764; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
1765; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
1766; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05001767; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001768; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1769; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1770; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1771; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001772; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001773; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001774; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
Shilei Tian6548b632024-11-08 20:21:16 -05001775; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001776; GFX9-NODL-NEXT: s_endpgm
1777;
1778; GFX9-DL-LABEL: udot2_MultipleUses_add1:
1779; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001780; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1781; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001782; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1783; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001784; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
1785; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
1786; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05001787; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001788; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1789; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1790; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1791; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001792; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001793; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001794; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
Shilei Tian6548b632024-11-08 20:21:16 -05001795; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001796; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001797;
1798; GFX10-DL-LABEL: udot2_MultipleUses_add1:
1799; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04001800; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001801; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1802; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001803; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001804; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1805; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001806; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
1807; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
1808; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1809; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001810; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1811; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
1812; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1813; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
1814; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05001815; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001816; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001817; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001818; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05001819; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001820; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001821 ptr addrspace(1) %src2,
1822 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00001823entry:
Jay Foadfdaa2d02021-02-19 15:04:03 +00001824 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001825 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
1826 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
1827 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
1828 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00001829
; First product: zext(%vec2[0]) * zext(%vec1[0]).
1830 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1831 %conv = zext i16 %s1.elt1 to i32
1832 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1833 %conv2 = zext i16 %s2.elt1 to i32
1834 %mul1 = mul i32 %conv2, %conv
1835
; Second product: zext(%vec2[1]) * zext(%vec1[1]).
1836 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1837 %conv3 = zext i16 %s1.elt2 to i32
1838 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1839 %conv4 = zext i16 %s2.elt2 to i32
1840 %mul2 = mul i32 %conv4, %conv3
1841
; %add1 feeds both %add2 and %res below; this second use is what the test
; name ("MultipleUses_add1") refers to.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001842 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001843 %add1 = add i32 %mul2, %s3
1844 %add2 = add i32 %add1, %mul1
1845
1846 %res = add i32 %add2, %add1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001847 store i32 %res, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001848 ret void
1849}
1850
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001851define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001852; GFX7-LABEL: idot2_MultipleUses_add1:
1853; GFX7: ; %bb.0: ; %entry
Matt Arsenaultd21fc582025-02-07 12:31:14 +07001854; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1855; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1856; GFX7-NEXT: s_mov_b32 s7, 0xf000
1857; GFX7-NEXT: s_mov_b32 s10, 0
1858; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00001859; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07001860; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00001861; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1862; GFX7-NEXT: v_mov_b32_e32 v1, 0
Matt Arsenaultd21fc582025-02-07 12:31:14 +07001863; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1864; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
1865; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1866; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
1867; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00001868; GFX7-NEXT: s_waitcnt vmcnt(1)
1869; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
1870; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
1871; GFX7-NEXT: s_waitcnt vmcnt(0)
1872; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
1873; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001874; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07001875; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001876; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0
1877; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
Matt Arsenaultd21fc582025-02-07 12:31:14 +07001878; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001879; GFX7-NEXT: s_endpgm
1880;
1881; GFX8-LABEL: idot2_MultipleUses_add1:
1882; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001883; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1884; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001885; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001886; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001887; GFX8-NEXT: v_mov_b32_e32 v1, s1
1888; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001889; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1890; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05001891; GFX8-NEXT: v_mov_b32_e32 v1, s3
1892; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00001893; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1894; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05001895; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001896; GFX8-NEXT: s_waitcnt vmcnt(1)
1897; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
1898; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
1899; GFX8-NEXT: s_waitcnt vmcnt(0)
1900; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
1901; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
Jay Foad0412f512019-12-17 16:09:02 +00001902; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001903; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001904; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0
1905; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05001906; GFX8-NEXT: v_mov_b32_e32 v0, s4
1907; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001908; GFX8-NEXT: flat_store_dword v[0:1], v2
1909; GFX8-NEXT: s_endpgm
1910;
1911; GFX9-NODL-LABEL: idot2_MultipleUses_add1:
1912; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001913; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1914; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001915; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1916; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001917; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
1918; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
1919; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05001920; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001921; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1922; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1923; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
1924; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001925; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001926; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001927; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
Shilei Tian6548b632024-11-08 20:21:16 -05001928; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001929; GFX9-NODL-NEXT: s_endpgm
1930;
1931; GFX9-DL-LABEL: idot2_MultipleUses_add1:
1932; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05001933; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1934; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001935; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1936; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001937; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
1938; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
1939; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05001940; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001941; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1942; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1943; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
1944; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001945; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001946; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001947; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
Shilei Tian6548b632024-11-08 20:21:16 -05001948; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001949; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001950;
1951; GFX10-DL-LABEL: idot2_MultipleUses_add1:
1952; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04001953; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001954; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1955; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00001956; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001957; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1958; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05001959; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
1960; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
1961; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
1962; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001963; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
1964; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
1965; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1966; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
1967; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05001968; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001969; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05001970; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00001971; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05001972; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00001973; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001974 ptr addrspace(1) %src2,
1975 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00001976entry:
Jay Foadfdaa2d02021-02-19 15:04:03 +00001977 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001978 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
1979 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
1980 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
1981 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00001982
1983 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1984 %conv = sext i16 %s1.elt1 to i32
1985 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1986 %conv2 = sext i16 %s2.elt1 to i32
1987 %mul1 = mul i32 %conv2, %conv
1988
1989 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1990 %conv3 = sext i16 %s1.elt2 to i32
1991 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1992 %conv4 = sext i16 %s2.elt2 to i32
1993 %mul2 = mul i32 %conv4, %conv3
1994
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01001995 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00001996 %add1 = add i32 %mul2, %s3
1997 %add2 = add i32 %add1, %mul1
1998
1999 %res = add i32 %add2, %add1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002000 store i32 %res, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00002001 ret void
2002}
2003
; udot2_MultipleUses_mul1: unsigned (zext) 2 x i16 dot-product shape in which
; the lane-0 product %mul1 has two uses: it is added to the accumulator loaded
; from %dst (%add0) and then added again into the final sum (%add2).
; The autogenerated checks for every run line expect separate mul/mad/add
; instructions here; none of them contains a dot2 instruction.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002004define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002005; GFX7-LABEL: udot2_MultipleUses_mul1:
2006; GFX7: ; %bb.0: ; %entry
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002007; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2008; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2009; GFX7-NEXT: s_mov_b32 s7, 0xf000
2010; GFX7-NEXT: s_mov_b32 s10, 0
2011; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00002012; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002013; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002014; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2015; GFX7-NEXT: v_mov_b32_e32 v1, 0
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002016; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2017; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
2018; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2019; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
2020; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00002021; GFX7-NEXT: s_waitcnt vmcnt(1)
2022; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
Jay Foade2926502022-05-16 15:53:03 +01002023; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002024; GFX7-NEXT: s_waitcnt vmcnt(0)
2025; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
Jay Foade2926502022-05-16 15:53:03 +01002026; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002027; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002028; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002029; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4
2030; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002031; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002032; GFX7-NEXT: s_endpgm
2033;
2034; GFX8-LABEL: udot2_MultipleUses_mul1:
2035; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002036; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2037; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002038; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002039; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002040; GFX8-NEXT: v_mov_b32_e32 v1, s1
2041; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002042; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2043; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002044; GFX8-NEXT: v_mov_b32_e32 v1, s3
2045; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002046; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2047; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002048; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002049; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01002050; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
Jay Foadfdaa2d02021-02-19 15:04:03 +00002051; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2052; GFX8-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01002053; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002054; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002055; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002056; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002057; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
2058; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05002059; GFX8-NEXT: v_mov_b32_e32 v0, s4
2060; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002061; GFX8-NEXT: flat_store_dword v[0:1], v2
2062; GFX8-NEXT: s_endpgm
2063;
2064; GFX9-NODL-LABEL: udot2_MultipleUses_mul1:
2065; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002066; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2067; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002068; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002069; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002070; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
2071; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
2072; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002073; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002074; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01002075; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1
Jay Foadfdaa2d02021-02-19 15:04:03 +00002076; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01002077; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002078; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2079; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002080; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002081; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002082; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2
Shilei Tian6548b632024-11-08 20:21:16 -05002083; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002084; GFX9-NODL-NEXT: s_endpgm
2085;
2086; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
2087; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002088; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2089; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002090; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002091; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002092; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
2093; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
2094; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002095; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002096; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01002097; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1
Jay Foadfdaa2d02021-02-19 15:04:03 +00002098; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01002099; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002100; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2101; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002102; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002103; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002104; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2
Shilei Tian6548b632024-11-08 20:21:16 -05002105; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002106; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002107;
2108; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
2109; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04002110; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002111; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2112; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002113; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002114; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00002115; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002116; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
2117; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
2118; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
2119; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002120; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
Jay Foad3eb22812022-05-16 15:48:11 +01002121; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1
Jay Foadfdaa2d02021-02-19 15:04:03 +00002122; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
Jay Foad3eb22812022-05-16 15:48:11 +01002123; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002124; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2125; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
Jay Foad0412f512019-12-17 16:09:02 +00002126; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002127; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002128; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2129; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
Shilei Tian6548b632024-11-08 20:21:16 -05002130; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002131; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002132 ptr addrspace(1) %src2,
2133 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00002134entry:
Jay Foadfdaa2d02021-02-19 15:04:03 +00002135 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002136 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2137 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2138 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2139 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00002140
2141 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2142 %conv = zext i16 %s1.elt1 to i32
2143 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2144 %conv2 = zext i16 %s2.elt1 to i32
2145 %mul1 = mul i32 %conv2, %conv
2146
2147 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2148 %conv3 = zext i16 %s1.elt2 to i32
2149 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2150 %conv4 = zext i16 %s2.elt2 to i32
2151 %mul2 = mul i32 %conv4, %conv3
2152
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002153 %s3 = load i32, ptr addrspace(1) %dst, align 4
; First use of %mul1: added to the accumulator %s3 loaded from %dst.
Farhana Aleen3528c802018-08-21 16:21:15 +00002154 %add0 = add i32 %mul1, %s3
2155
2156 %add1 = add i32 %mul2, %add0
; Second use of %mul1: added once more into the final sum.
2157 %add2 = add i32 %add1, %mul1
2158
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002159 store i32 %add2, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00002160 ret void
2161}
2162
; idot2_MultipleUses_mul1: signed (sext) 2 x i16 dot-product shape in which
; the lane-0 product %mul1 has two uses: it is added to the accumulator loaded
; from %dst (%add0) and then added again into the final sum (%add2).
; The autogenerated checks for every run line expect separate mul/mad/add
; instructions here; none of them contains a dot2 instruction.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002163define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002164; GFX7-LABEL: idot2_MultipleUses_mul1:
2165; GFX7: ; %bb.0: ; %entry
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002166; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2167; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2168; GFX7-NEXT: s_mov_b32 s7, 0xf000
2169; GFX7-NEXT: s_mov_b32 s10, 0
2170; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00002171; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002172; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002173; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2174; GFX7-NEXT: v_mov_b32_e32 v1, 0
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002175; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2176; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
2177; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2178; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
2179; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00002180; GFX7-NEXT: s_waitcnt vmcnt(1)
2181; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
2182; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
2183; GFX7-NEXT: s_waitcnt vmcnt(0)
2184; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
2185; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002186; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002187; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002188; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
2189; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002190; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002191; GFX7-NEXT: s_endpgm
2192;
2193; GFX8-LABEL: idot2_MultipleUses_mul1:
2194; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002195; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2196; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002197; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002198; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002199; GFX8-NEXT: v_mov_b32_e32 v1, s1
2200; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002201; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2202; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002203; GFX8-NEXT: v_mov_b32_e32 v1, s3
2204; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002205; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2206; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002207; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002208; GFX8-NEXT: s_waitcnt vmcnt(1)
2209; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
2210; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
2211; GFX8-NEXT: s_waitcnt vmcnt(0)
2212; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
2213; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
2214; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002215; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002216; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
2217; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05002218; GFX8-NEXT: v_mov_b32_e32 v0, s4
2219; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002220; GFX8-NEXT: flat_store_dword v[0:1], v2
2221; GFX8-NEXT: s_endpgm
2222;
2223; GFX9-NODL-LABEL: idot2_MultipleUses_mul1:
2224; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002225; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2226; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002227; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2228; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002229; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
2230; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
2231; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002232; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002233; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
2234; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16
2235; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2236; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 16
2237; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2238; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v4, v3
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002239; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002240; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002241; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2
Shilei Tian6548b632024-11-08 20:21:16 -05002242; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002243; GFX9-NODL-NEXT: s_endpgm
2244;
2245; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
2246; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002247; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2248; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002249; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2250; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002251; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
2252; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
2253; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002254; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002255; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2256; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16
2257; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2258; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 16
2259; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2260; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v4, v3
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002261; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002262; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002263; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2
Shilei Tian6548b632024-11-08 20:21:16 -05002264; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002265; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002266;
2267; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
2268; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04002269; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002270; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2271; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002272; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002273; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00002274; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002275; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
2276; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
2277; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
2278; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002279; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2280; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 16
2281; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2282; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 16
2283; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2284; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002285; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002286; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002287; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2288; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
Shilei Tian6548b632024-11-08 20:21:16 -05002289; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002290; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002291 ptr addrspace(1) %src2,
2292 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00002293entry:
Jay Foadfdaa2d02021-02-19 15:04:03 +00002294 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002295 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2296 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2297 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2298 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00002299
2300 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2301 %conv = sext i16 %s1.elt1 to i32
2302 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2303 %conv2 = sext i16 %s2.elt1 to i32
2304 %mul1 = mul i32 %conv2, %conv
2305
2306 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2307 %conv3 = sext i16 %s1.elt2 to i32
2308 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2309 %conv4 = sext i16 %s2.elt2 to i32
2310 %mul2 = mul i32 %conv4, %conv3
2311
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002312 %s3 = load i32, ptr addrspace(1) %dst, align 4
; First use of %mul1: added to the accumulator %s3 loaded from %dst.
Farhana Aleen3528c802018-08-21 16:21:15 +00002313 %add0 = add i32 %mul1, %s3
2314
2315 %add1 = add i32 %mul2, %add0
; Second use of %mul1: added once more into the final sum.
2316 %add2 = add i32 %add1, %mul1
2317
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002318 store i32 %add2, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00002319 ret void
2320}
2321
; udot2_MultipleUses_mul2: unsigned (zext) 2 x i16 dot-product shape in which
; the lane-1 product %mul2 has two uses: it is added to the accumulator loaded
; from %dst (%add0) and then added again (%add1) before the lane-0 product
; %mul1 is folded in (%add2).
; The autogenerated checks for every run line expect separate mul/mad/add
; instructions here; none of them contains a dot2 instruction.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002322define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002323; GFX7-LABEL: udot2_MultipleUses_mul2:
2324; GFX7: ; %bb.0: ; %entry
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002325; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2326; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2327; GFX7-NEXT: s_mov_b32 s7, 0xf000
2328; GFX7-NEXT: s_mov_b32 s10, 0
2329; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00002330; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002331; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002332; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2333; GFX7-NEXT: v_mov_b32_e32 v1, 0
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002334; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2335; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
2336; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2337; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
2338; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00002339; GFX7-NEXT: s_waitcnt vmcnt(1)
2340; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
Jay Foade2926502022-05-16 15:53:03 +01002341; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002342; GFX7-NEXT: s_waitcnt vmcnt(0)
2343; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002344; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002345; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s0
Jay Foade2926502022-05-16 15:53:03 +01002346; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002347; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4
2348; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002349; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002350; GFX7-NEXT: s_endpgm
2351;
2352; GFX8-LABEL: udot2_MultipleUses_mul2:
2353; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002354; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2355; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002356; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002357; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002358; GFX8-NEXT: v_mov_b32_e32 v1, s1
2359; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002360; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2361; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002362; GFX8-NEXT: v_mov_b32_e32 v1, s3
2363; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002364; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2365; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002366; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002367; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foade2926502022-05-16 15:53:03 +01002368; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
Jay Foadfdaa2d02021-02-19 15:04:03 +00002369; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2370; GFX8-NEXT: s_waitcnt vmcnt(0)
Jay Foade2926502022-05-16 15:53:03 +01002371; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002372; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002373; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002374; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002375; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
2376; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05002377; GFX8-NEXT: v_mov_b32_e32 v0, s4
2378; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002379; GFX8-NEXT: flat_store_dword v[0:1], v2
2380; GFX8-NEXT: s_endpgm
2381;
2382; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
2383; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002384; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2385; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002386; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2387; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002388; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
2389; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
2390; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002391; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002392; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2393; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2394; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2395; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2396; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002397; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002398; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002399; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
Shilei Tian6548b632024-11-08 20:21:16 -05002400; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002401; GFX9-NODL-NEXT: s_endpgm
2402;
2403; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
2404; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002405; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2406; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002407; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2408; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002409; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
2410; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
2411; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002412; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002413; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2414; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2415; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2416; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2417; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002418; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002419; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002420; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
Shilei Tian6548b632024-11-08 20:21:16 -05002421; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002422; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002423;
2424; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
2425; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04002426; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002427; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2428; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002429; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002430; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00002431; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002432; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
2433; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
2434; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
2435; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002436; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2437; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
2438; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2439; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2440; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2441; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
Austin Kerbow2291bd12020-11-30 09:06:35 -08002442; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002443; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002444; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2445; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
Shilei Tian6548b632024-11-08 20:21:16 -05002446; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002447; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002448 ptr addrspace(1) %src2,
2449 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00002450entry:
Jay Foadfdaa2d02021-02-19 15:04:03 +00002451 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002452 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2453 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2454 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2455 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00002456
2457 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2458 %conv = zext i16 %s1.elt1 to i32
2459 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2460 %conv2 = zext i16 %s2.elt1 to i32
2461 %mul1 = mul i32 %conv2, %conv
2462
2463 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2464 %conv3 = zext i16 %s1.elt2 to i32
2465 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2466 %conv4 = zext i16 %s2.elt2 to i32
2467 %mul2 = mul i32 %conv4, %conv3
2468
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002469 %s3 = load i32, ptr addrspace(1) %dst, align 4
; First use of %mul2: added to the accumulator %s3 loaded from %dst.
Farhana Aleen3528c802018-08-21 16:21:15 +00002470 %add0 = add i32 %mul2, %s3
2471
; Second use of %mul2: added again on top of %add0.
2472 %add1 = add i32 %mul2, %add0
2473 %add2 = add i32 %add1, %mul1
2474
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002475 store i32 %add2, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00002476 ret void
2477}
2478
; idot2_MultipleUses_mul2: signed variant of the two-element i16 dot-product
; shape. Both <2 x i16> operands are sign-extended to i32 before multiplying,
; and the high-element product %mul2 feeds two separate adds (%add0, %add1).
; Per the autogenerated checks below, every check prefix expands this into
; v_mad_i32_i24 / v_mul_i32_i24 / v_add3_u32 sequences; no v_dot2 instruction
; appears, including under the DL-capable prefixes.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002479define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002480; GFX7-LABEL: idot2_MultipleUses_mul2:
2481; GFX7: ; %bb.0: ; %entry
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002482; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2483; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2484; GFX7-NEXT: s_mov_b32 s7, 0xf000
2485; GFX7-NEXT: s_mov_b32 s10, 0
2486; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00002487; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002488; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002489; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2490; GFX7-NEXT: v_mov_b32_e32 v1, 0
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002491; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2492; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
2493; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2494; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
2495; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00002496; GFX7-NEXT: s_waitcnt vmcnt(1)
2497; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
2498; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
2499; GFX7-NEXT: s_waitcnt vmcnt(0)
2500; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
2501; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002502; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002503; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002504; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
2505; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
Matt Arsenaultd21fc582025-02-07 12:31:14 +07002506; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002507; GFX7-NEXT: s_endpgm
2508;
2509; GFX8-LABEL: idot2_MultipleUses_mul2:
2510; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002511; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2512; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002513; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002514; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002515; GFX8-NEXT: v_mov_b32_e32 v1, s1
2516; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002517; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2518; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002519; GFX8-NEXT: v_mov_b32_e32 v1, s3
2520; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002521; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2522; GFX8-NEXT: flat_load_dword v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002523; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002524; GFX8-NEXT: s_waitcnt vmcnt(1)
2525; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
2526; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
2527; GFX8-NEXT: s_waitcnt vmcnt(0)
2528; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
2529; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
Jay Foad0412f512019-12-17 16:09:02 +00002530; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002531; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002532; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
2533; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05002534; GFX8-NEXT: v_mov_b32_e32 v0, s4
2535; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002536; GFX8-NEXT: flat_store_dword v[0:1], v2
2537; GFX8-NEXT: s_endpgm
2538;
2539; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
2540; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002541; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2542; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002543; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2544; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002545; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
2546; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
2547; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002548; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002549; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2550; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2551; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
2552; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
2553; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002554; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002555; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002556; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
Shilei Tian6548b632024-11-08 20:21:16 -05002557; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002558; GFX9-NODL-NEXT: s_endpgm
2559;
2560; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
2561; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002562; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2563; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002564; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2565; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002566; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
2567; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
2568; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002569; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002570; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2571; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2572; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
2573; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
2574; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002575; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002576; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002577; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
Shilei Tian6548b632024-11-08 20:21:16 -05002578; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002579; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002580;
2581; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
2582; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04002583; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002584; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2585; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002586; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002587; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00002588; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002589; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
2590; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
2591; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
2592; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002593; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2594; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
2595; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2596; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
2597; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2598; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002599; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002600; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002601; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
2602; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
Shilei Tian6548b632024-11-08 20:21:16 -05002603; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002604; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002605 ptr addrspace(1) %src2,
2606 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00002607entry:
; Each workitem reads one <2 x i16> from %src1 and one from %src2, indexed by
; the value returned from the workitem.id.x intrinsic.
Jay Foadfdaa2d02021-02-19 15:04:03 +00002608 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002609 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2610 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2611 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2612 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00002613
; %mul1 = sext(vec2[0]) * sext(vec1[0])
2614 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2615 %conv = sext i16 %s1.elt1 to i32
2616 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2617 %conv2 = sext i16 %s2.elt1 to i32
2618 %mul1 = mul i32 %conv2, %conv
2619
; %mul2 = sext(vec2[1]) * sext(vec1[1])
2620 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2621 %conv3 = sext i16 %s1.elt2 to i32
2622 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2623 %conv4 = sext i16 %s2.elt2 to i32
2624 %mul2 = mul i32 %conv4, %conv3
2625
; The accumulator is the i32 currently stored at %dst.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002626 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00002627 %add0 = add i32 %mul2, %s3
2628
; %mul2 is added a second time here; this is the "multiple uses" of the
; test name. The stored result is %mul1 + 2 * %mul2 + %s3.
2629 %add1 = add i32 %mul2, %add0
2630 %add2 = add i32 %add1, %mul1
2631
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002632 store i32 %add2, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00002633 ret void
2634}
2635
; udot2_acc16: two-element i16 dot-product shape with a 16-bit accumulator.
; The element products are computed directly in i16 (no zext/sext to i32) and
; added to an i16 that is loaded from, and stored back to, %dst.
; Per the autogenerated checks below, all check prefixes use mad sequences
; (v_mad_u32_u24, v_mad_u16 or v_mad_legacy_u16); no dot instruction is emitted.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002636define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002637; GFX7-LABEL: udot2_acc16:
2638; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002639; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2640; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2641; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +00002642; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -05002643; GFX7-NEXT: s_mov_b32 s11, s7
Jay Foadfdaa2d02021-02-19 15:04:03 +00002644; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002645; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002646; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2647; GFX7-NEXT: v_mov_b32_e32 v1, 0
2648; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -05002649; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
Joe Nash3ce1b962021-09-08 13:22:15 -04002650; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -05002651; GFX7-NEXT: s_mov_b32 s6, -1
2652; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002653; GFX7-NEXT: s_waitcnt vmcnt(2)
Austin Kerbowda067ed2021-11-10 09:59:31 -08002654; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
Jay Foade2926502022-05-16 15:53:03 +01002655; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002656; GFX7-NEXT: s_waitcnt vmcnt(1)
Austin Kerbowda067ed2021-11-10 09:59:31 -08002657; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0
Jay Foade2926502022-05-16 15:53:03 +01002658; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002659; GFX7-NEXT: s_waitcnt vmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -08002660; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
Jay Foadfdaa2d02021-02-19 15:04:03 +00002661; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
Shilei Tian6548b632024-11-08 20:21:16 -05002662; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002663; GFX7-NEXT: s_endpgm
2664;
2665; GFX8-LABEL: udot2_acc16:
2666; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002667; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2668; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002669; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002670; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002671; GFX8-NEXT: v_mov_b32_e32 v1, s1
2672; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002673; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
Austin Kerbowda067ed2021-11-10 09:59:31 -08002674; GFX8-NEXT: flat_load_dword v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002675; GFX8-NEXT: v_mov_b32_e32 v1, s3
2676; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002677; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
Austin Kerbowda067ed2021-11-10 09:59:31 -08002678; GFX8-NEXT: flat_load_dword v2, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002679; GFX8-NEXT: v_mov_b32_e32 v0, s4
2680; GFX8-NEXT: v_mov_b32_e32 v1, s5
Austin Kerbowda067ed2021-11-10 09:59:31 -08002681; GFX8-NEXT: flat_load_ushort v4, v[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002682; GFX8-NEXT: s_waitcnt vmcnt(2)
Austin Kerbowda067ed2021-11-10 09:59:31 -08002683; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
Jay Foadfdaa2d02021-02-19 15:04:03 +00002684; GFX8-NEXT: s_waitcnt vmcnt(1)
Austin Kerbowda067ed2021-11-10 09:59:31 -08002685; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002686; GFX8-NEXT: s_waitcnt vmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -08002687; GFX8-NEXT: v_mad_u16 v4, v5, v6, v4
2688; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
2689; GFX8-NEXT: flat_store_short v[0:1], v2
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002690; GFX8-NEXT: s_endpgm
2691;
2692; GFX9-NODL-LABEL: udot2_acc16:
2693; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002694; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2695; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002696; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2697; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002698; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002699; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
2700; GFX9-NODL-NEXT: global_load_dword v3, v0, s[2:3]
2701; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[6:7]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002702; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
2703; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
2704; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
Austin Kerbowda067ed2021-11-10 09:59:31 -08002705; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
Austin Kerbow2291bd12020-11-30 09:06:35 -08002706; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -08002707; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4
Jay Foadfdaa2d02021-02-19 15:04:03 +00002708; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0
Shilei Tian6548b632024-11-08 20:21:16 -05002709; GFX9-NODL-NEXT: global_store_short v1, v0, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002710; GFX9-NODL-NEXT: s_endpgm
2711;
2712; GFX9-DL-LABEL: udot2_acc16:
2713; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002714; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2715; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002716; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2717; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002718; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002719; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
2720; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
2721; GFX9-DL-NEXT: global_load_ushort v4, v1, s[6:7]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002722; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
2723; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
2724; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
Austin Kerbowda067ed2021-11-10 09:59:31 -08002725; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002726; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -08002727; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4
Jay Foadfdaa2d02021-02-19 15:04:03 +00002728; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0
Shilei Tian6548b632024-11-08 20:21:16 -05002729; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002730; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002731;
2732; GFX10-DL-LABEL: udot2_acc16:
2733; GFX10-DL: ; %bb.0: ; %entry
Jay Foadfdaa2d02021-02-19 15:04:03 +00002734; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002735; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2736; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002737; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2738; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002739; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00002740; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002741; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
2742; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
2743; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002744; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
Baptiste Saleilcaf12942021-04-26 15:48:12 -04002745; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002746; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2747; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
2748; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
Baptiste Saleilcaf12942021-04-26 15:48:12 -04002749; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
Jay Foadfdaa2d02021-02-19 15:04:03 +00002750; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
Shilei Tian6548b632024-11-08 20:21:16 -05002751; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002752; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002753 ptr addrspace(1) %src2,
2754 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00002755entry:
; Each workitem reads one <2 x i16> from %src1 and one from %src2, indexed by
; the value returned from the workitem.id.x intrinsic.
Jay Foadfdaa2d02021-02-19 15:04:03 +00002756 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002757 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2758 %v1 = load <2 x i16>, ptr addrspace(1) %gep1
2759 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2760 %v2 = load <2 x i16>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00002761
; Element 0 product, kept in i16.
2762 %v1e1 = extractelement <2 x i16> %v1, i64 0
2763 %v2e1 = extractelement <2 x i16> %v2, i64 0
2764 %mul1 = mul i16 %v1e1, %v2e1
2765
; Element 1 product, kept in i16.
2766 %v1e2 = extractelement <2 x i16> %v1, i64 1
2767 %v2e2 = extractelement <2 x i16> %v2, i64 1
2768 %mul2 = mul i16 %v1e2, %v2e2
2769
; i16 accumulator: loaded from %dst, both products added, result stored back.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002770 %s2 = load i16, ptr addrspace(1) %dst, align 2
Farhana Aleen3528c802018-08-21 16:21:15 +00002771 %add1 = add i16 %mul2, %s2
2772 %add2 = add i16 %add1, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002773 store i16 %add2, ptr addrspace(1) %dst, align 2
Farhana Aleen3528c802018-08-21 16:21:15 +00002774 ret void
2775}
2776
; notsdot2_sext8: two-element dot-product shape whose operands are <2 x i8>
; vectors; each element is sign-extended from i8 to i32 before the multiplies,
; and the accumulator is an i32 loaded from and stored back to %dst.
; Per the autogenerated checks below, no v_dot2 instruction is selected: the
; non-DL prefixes use v_mad_i32_i24 or v_mul_i32_i24_sdwa + v_add3_u32, while
; the DL prefixes emit v_perm_b32 (selector 0xc0c0001) on each operand followed
; by v_dot4_i32_i8 / v_dot4c_i32_i8.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002777define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002778; GFX7-LABEL: notsdot2_sext8:
2779; GFX7: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002780; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2781; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2782; GFX7-NEXT: s_mov_b32 s7, 0xf000
Jay Foadfdaa2d02021-02-19 15:04:03 +00002783; GFX7-NEXT: s_mov_b32 s10, 0
Shilei Tian6548b632024-11-08 20:21:16 -05002784; GFX7-NEXT: s_mov_b32 s11, s7
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002785; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002786; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002787; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2788; GFX7-NEXT: v_mov_b32_e32 v1, 0
2789; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -05002790; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
Jay Foadfdaa2d02021-02-19 15:04:03 +00002791; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
Shilei Tian6548b632024-11-08 20:21:16 -05002792; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
2793; GFX7-NEXT: s_mov_b32 s6, -1
Jay Foadfdaa2d02021-02-19 15:04:03 +00002794; GFX7-NEXT: s_waitcnt vmcnt(1)
2795; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
2796; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8
2797; GFX7-NEXT: s_waitcnt vmcnt(0)
2798; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 8
2799; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8
2800; GFX7-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002801; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002802; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05002803; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002804; GFX7-NEXT: s_endpgm
2805;
2806; GFX8-LABEL: notsdot2_sext8:
2807; GFX8: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002808; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2809; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002810; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002811; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002812; GFX8-NEXT: v_mov_b32_e32 v1, s1
2813; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002814; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2815; GFX8-NEXT: flat_load_ushort v3, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002816; GFX8-NEXT: v_mov_b32_e32 v1, s3
2817; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002818; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
Jay Foad0412f512019-12-17 16:09:02 +00002819; GFX8-NEXT: flat_load_ushort v0, v[0:1]
Shilei Tian6548b632024-11-08 20:21:16 -05002820; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
Tony1bc7bff2020-10-16 07:09:38 +00002821; GFX8-NEXT: s_waitcnt vmcnt(1)
Jay Foadfdaa2d02021-02-19 15:04:03 +00002822; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
2823; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3
2824; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
2825; GFX8-NEXT: s_waitcnt vmcnt(0)
Jay Foad62fd7f72020-01-07 15:43:46 +00002826; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
Stanislav Mekhanoshin71ed66d2020-05-12 14:18:53 -07002827; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
Jay Foad0412f512019-12-17 16:09:02 +00002828; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
Tony1bc7bff2020-10-16 07:09:38 +00002829; GFX8-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002830; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002831; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05002832; GFX8-NEXT: v_mov_b32_e32 v0, s4
2833; GFX8-NEXT: v_mov_b32_e32 v1, s5
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002834; GFX8-NEXT: flat_store_dword v[0:1], v2
2835; GFX8-NEXT: s_endpgm
2836;
2837; GFX9-NODL-LABEL: notsdot2_sext8:
2838; GFX9-NODL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002839; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2840; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002841; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002842; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002843; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1]
2844; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[2:3]
2845; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
Jay Foadfdaa2d02021-02-19 15:04:03 +00002846; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002847; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
Jay Foadfdaa2d02021-02-19 15:04:03 +00002848; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2849; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002850; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
Jay Foadfdaa2d02021-02-19 15:04:03 +00002851; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
Jay Foad0412f512019-12-17 16:09:02 +00002852; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002853; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
2854; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002855; GFX9-NODL-NEXT: s_endpgm
2856;
2857; GFX9-DL-LABEL: notsdot2_sext8:
2858; GFX9-DL: ; %bb.0: ; %entry
Shilei Tian6548b632024-11-08 20:21:16 -05002859; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2860; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002861; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002862; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002863; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1]
2864; GFX9-DL-NEXT: global_load_ushort v2, v0, s[2:3]
2865; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
2866; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0001
Jay Foadfdaa2d02021-02-19 15:04:03 +00002867; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
Jeffrey Byrnes7794e162023-08-28 15:44:23 -07002868; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
Shilei Tian6548b632024-11-08 20:21:16 -05002869; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002870; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002871; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
Jay Foad0412f512019-12-17 16:09:02 +00002872; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002873; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0
2874; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002875; GFX9-DL-NEXT: s_endpgm
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002876;
2877; GFX10-DL-LABEL: notsdot2_sext8:
2878; GFX10-DL: ; %bb.0: ; %entry
Matt Arsenaultb1bcb7c2024-07-15 09:59:07 +04002879; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002880; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2881; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
Jay Foadfdaa2d02021-02-19 15:04:03 +00002882; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
Jeffrey Byrnes7794e162023-08-28 15:44:23 -07002883; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002884; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002885; GFX10-DL-NEXT: s_clause 0x1
Shilei Tian6548b632024-11-08 20:21:16 -05002886; GFX10-DL-NEXT: global_load_ushort v1, v0, s[0:1]
2887; GFX10-DL-NEXT: global_load_ushort v2, v0, s[2:3]
2888; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
2889; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002890; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
Jeffrey Byrnes7794e162023-08-28 15:44:23 -07002891; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0001
Matt Arsenaultd2e52ee2020-11-10 11:06:59 -05002892; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
Jeffrey Byrnes7794e162023-08-28 15:44:23 -07002893; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0001
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002894; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -05002895; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
Shilei Tiand63c2e52024-01-15 23:11:50 -05002896; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
Shilei Tian6548b632024-11-08 20:21:16 -05002897; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7]
Stanislav Mekhanoshine917b3b2019-06-20 16:29:40 +00002898; GFX10-DL-NEXT: s_endpgm
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002899 ptr addrspace(1) %src2,
2900 ptr addrspace(1) nocapture %dst) {
Farhana Aleen3528c802018-08-21 16:21:15 +00002901entry:
; Each workitem reads one <2 x i8> from %src1 and one from %src2, indexed by
; the value returned from the workitem.id.x intrinsic.
Jay Foadfdaa2d02021-02-19 15:04:03 +00002902 %idx = call i32 @llvm.amdgcn.workitem.id.x()
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002903 %gep1 = getelementptr <2 x i8>, ptr addrspace(1) %src1, i32 %idx
2904 %vec1 = load <2 x i8>, ptr addrspace(1) %gep1
2905 %gep2 = getelementptr <2 x i8>, ptr addrspace(1) %src2, i32 %idx
2906 %vec2 = load <2 x i8>, ptr addrspace(1) %gep2
Farhana Aleen3528c802018-08-21 16:21:15 +00002907
; %mul1 = sext(vec2[0]) * sext(vec1[0])
; NOTE(review): both multiplies carry `nuw` although their operands are
; sign-extended; presumably intentional for this test. Confirm before changing,
; since the check lines are autogenerated from this IR.
2908 %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
2909 %conv = sext i8 %s1.elt1 to i32
2910 %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
2911 %conv2 = sext i8 %s2.elt1 to i32
2912 %mul1 = mul nuw i32 %conv2, %conv
2913
; %mul2 = sext(vec2[1]) * sext(vec1[1])
2914 %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
2915 %conv3 = sext i8 %s1.elt2 to i32
2916 %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
2917 %conv4 = sext i8 %s2.elt2 to i32
2918 %mul2 = mul nuw i32 %conv4, %conv3
2919
; i32 accumulator: loaded from %dst, both products added, result stored back.
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002920 %s3 = load i32, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00002921 %add = add i32 %mul2, %s3
2922 %add6 = add i32 %add, %mul1
Nikita Popovbdf2fbb2022-12-19 12:39:01 +01002923 store i32 %add6, ptr addrspace(1) %dst, align 4
Farhana Aleen3528c802018-08-21 16:21:15 +00002924 ret void
2925}
Jay Foadfdaa2d02021-02-19 15:04:03 +00002926
; Intrinsic called by the kernels above to obtain %idx, the per-workitem index
; used in their getelementptr address computations.
2927declare i32 @llvm.amdgcn.workitem.id.x()