blob: fbcdbed338e60cf73902a94bd24bf25394101cb4 [file] [log] [blame]
Ivan Kosarev150c73a2023-06-14 11:40:48 +01001; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
Fangrui Song9e9907f2024-01-16 21:54:58 -08002; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
Brox Chen8b23ebb2025-01-03 03:55:58 -05006; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +00007
; Two chained llvm.maxnum.f32 calls, max(max(a, b), c), fed by three loads.
; Every run line expects the pair to be selected as a single three-operand
; max (v_max3_f32, or v_max3_num_f32 in the gfx1200 run).
Matt Arsenault177ff422022-11-29 17:49:58 -05008define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
Ivan Kosarev150c73a2023-06-14 11:40:48 +01009; SI-LABEL: test_fmax3_olt_0_f32:
10; SI: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -050011; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
Ivan Kosarev150c73a2023-06-14 11:40:48 +010012; SI-NEXT: s_mov_b32 s11, 0xf000
13; SI-NEXT: s_mov_b32 s10, -1
14; SI-NEXT: s_mov_b32 s14, s10
15; SI-NEXT: s_mov_b32 s15, s11
16; SI-NEXT: s_mov_b32 s18, s10
17; SI-NEXT: s_mov_b32 s19, s11
18; SI-NEXT: s_mov_b32 s22, s10
19; SI-NEXT: s_mov_b32 s23, s11
20; SI-NEXT: s_waitcnt lgkmcnt(0)
21; SI-NEXT: s_mov_b32 s12, s2
22; SI-NEXT: s_mov_b32 s13, s3
23; SI-NEXT: s_mov_b32 s16, s4
24; SI-NEXT: s_mov_b32 s17, s5
25; SI-NEXT: s_mov_b32 s20, s6
26; SI-NEXT: s_mov_b32 s21, s7
27; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
28; SI-NEXT: s_waitcnt vmcnt(0)
29; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
30; SI-NEXT: s_waitcnt vmcnt(0)
31; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc
32; SI-NEXT: s_waitcnt vmcnt(0)
33; SI-NEXT: s_mov_b32 s8, s0
34; SI-NEXT: s_mov_b32 s9, s1
35; SI-NEXT: v_max3_f32 v0, v0, v1, v2
36; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
37; SI-NEXT: s_endpgm
38;
39; VI-LABEL: test_fmax3_olt_0_f32:
40; VI: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -050041; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
Ivan Kosarev150c73a2023-06-14 11:40:48 +010042; VI-NEXT: s_mov_b32 s11, 0xf000
43; VI-NEXT: s_mov_b32 s10, -1
44; VI-NEXT: s_mov_b32 s14, s10
45; VI-NEXT: s_mov_b32 s15, s11
46; VI-NEXT: s_waitcnt lgkmcnt(0)
47; VI-NEXT: s_mov_b32 s12, s2
48; VI-NEXT: s_mov_b32 s13, s3
49; VI-NEXT: s_mov_b32 s16, s4
50; VI-NEXT: s_mov_b32 s17, s5
51; VI-NEXT: s_mov_b32 s18, s10
52; VI-NEXT: s_mov_b32 s19, s11
53; VI-NEXT: s_mov_b32 s4, s6
54; VI-NEXT: s_mov_b32 s5, s7
55; VI-NEXT: s_mov_b32 s6, s10
56; VI-NEXT: s_mov_b32 s7, s11
57; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
58; VI-NEXT: s_waitcnt vmcnt(0)
59; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
60; VI-NEXT: s_waitcnt vmcnt(0)
61; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
62; VI-NEXT: s_waitcnt vmcnt(0)
63; VI-NEXT: s_mov_b32 s8, s0
64; VI-NEXT: s_mov_b32 s9, s1
65; VI-NEXT: v_max3_f32 v0, v0, v1, v2
66; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
67; VI-NEXT: s_endpgm
68;
69; GFX9-LABEL: test_fmax3_olt_0_f32:
70; GFX9: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -050071; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
Christudasan Devadasan229e1182024-07-23 13:59:15 +053072; GFX9-NEXT: s_mov_b32 s3, 0xf000
73; GFX9-NEXT: s_mov_b32 s2, -1
Shilei Tian6548b632024-11-08 20:21:16 -050074; GFX9-NEXT: s_mov_b32 s6, s2
75; GFX9-NEXT: s_mov_b32 s7, s3
Shilei Tianca336492024-11-08 16:36:10 -050076; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050077; GFX9-NEXT: s_mov_b32 s4, s10
78; GFX9-NEXT: s_mov_b32 s5, s11
79; GFX9-NEXT: s_mov_b32 s16, s12
80; GFX9-NEXT: s_mov_b32 s17, s13
Shilei Tianca336492024-11-08 16:36:10 -050081; GFX9-NEXT: s_mov_b32 s18, s2
82; GFX9-NEXT: s_mov_b32 s19, s3
Shilei Tian6548b632024-11-08 20:21:16 -050083; GFX9-NEXT: s_mov_b32 s12, s14
84; GFX9-NEXT: s_mov_b32 s13, s15
85; GFX9-NEXT: s_mov_b32 s14, s2
86; GFX9-NEXT: s_mov_b32 s15, s3
87; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
Ivan Kosarev150c73a2023-06-14 11:40:48 +010088; GFX9-NEXT: s_waitcnt vmcnt(0)
89; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
90; GFX9-NEXT: s_waitcnt vmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050091; GFX9-NEXT: buffer_load_dword v2, off, s[12:15], 0 glc
Ivan Kosarev150c73a2023-06-14 11:40:48 +010092; GFX9-NEXT: s_waitcnt vmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -050093; GFX9-NEXT: s_mov_b32 s0, s8
94; GFX9-NEXT: s_mov_b32 s1, s9
Ivan Kosarev150c73a2023-06-14 11:40:48 +010095; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
Christudasan Devadasan229e1182024-07-23 13:59:15 +053096; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
Ivan Kosarev150c73a2023-06-14 11:40:48 +010097; GFX9-NEXT: s_endpgm
98;
99; GFX11-LABEL: test_fmax3_olt_0_f32:
100; GFX11: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500101; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100102; GFX11-NEXT: s_mov_b32 s10, -1
103; GFX11-NEXT: s_mov_b32 s11, 0x31016000
104; GFX11-NEXT: s_mov_b32 s14, s10
105; GFX11-NEXT: s_mov_b32 s15, s11
106; GFX11-NEXT: s_mov_b32 s18, s10
107; GFX11-NEXT: s_mov_b32 s19, s11
108; GFX11-NEXT: s_mov_b32 s22, s10
109; GFX11-NEXT: s_mov_b32 s23, s11
110; GFX11-NEXT: s_waitcnt lgkmcnt(0)
111; GFX11-NEXT: s_mov_b32 s12, s2
112; GFX11-NEXT: s_mov_b32 s13, s3
113; GFX11-NEXT: s_mov_b32 s16, s4
114; GFX11-NEXT: s_mov_b32 s17, s5
115; GFX11-NEXT: s_mov_b32 s20, s6
116; GFX11-NEXT: s_mov_b32 s21, s7
117; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
118; GFX11-NEXT: s_waitcnt vmcnt(0)
119; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
120; GFX11-NEXT: s_waitcnt vmcnt(0)
121; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
122; GFX11-NEXT: s_waitcnt vmcnt(0)
123; GFX11-NEXT: s_mov_b32 s8, s0
124; GFX11-NEXT: s_mov_b32 s9, s1
125; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
126; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100127; GFX11-NEXT: s_endpgm
Brox Chen8b23ebb2025-01-03 03:55:58 -0500128;
129; GFX12-LABEL: test_fmax3_olt_0_f32:
130; GFX12: ; %bb.0:
131; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
132; GFX12-NEXT: s_mov_b32 s10, -1
133; GFX12-NEXT: s_mov_b32 s11, 0x31016000
134; GFX12-NEXT: s_mov_b32 s14, s10
135; GFX12-NEXT: s_mov_b32 s15, s11
136; GFX12-NEXT: s_mov_b32 s18, s10
137; GFX12-NEXT: s_mov_b32 s19, s11
138; GFX12-NEXT: s_mov_b32 s22, s10
139; GFX12-NEXT: s_mov_b32 s23, s11
140; GFX12-NEXT: s_wait_kmcnt 0x0
141; GFX12-NEXT: s_mov_b32 s12, s2
142; GFX12-NEXT: s_mov_b32 s13, s3
143; GFX12-NEXT: s_mov_b32 s16, s4
144; GFX12-NEXT: s_mov_b32 s17, s5
145; GFX12-NEXT: s_mov_b32 s20, s6
146; GFX12-NEXT: s_mov_b32 s21, s7
147; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
148; GFX12-NEXT: s_wait_loadcnt 0x0
149; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
150; GFX12-NEXT: s_wait_loadcnt 0x0
151; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
152; GFX12-NEXT: s_wait_loadcnt 0x0
153; GFX12-NEXT: s_mov_b32 s8, s0
154; GFX12-NEXT: s_mov_b32 s9, s1
155; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2
156; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
157; GFX12-NEXT: s_endpgm
; All three inputs are volatile loads. The inner maxnum(a, b) is the FIRST
; operand of the outer maxnum, and the expected max3 lists the loaded
; registers in load order (v0, v1, v2).
Matt Arsenault177ff422022-11-29 17:49:58 -0500158 %a = load volatile float, ptr addrspace(1) %aptr, align 4
159 %b = load volatile float, ptr addrspace(1) %bptr, align 4
160 %c = load volatile float, ptr addrspace(1) %cptr, align 4
Matt Arsenaultee324ff2017-05-17 19:25:06 +0000161 %f0 = call float @llvm.maxnum.f32(float %a, float %b)
162 %f1 = call float @llvm.maxnum.f32(float %f0, float %c)
Matt Arsenault177ff422022-11-29 17:49:58 -0500163 store float %f1, ptr addrspace(1) %out, align 4
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +0000164 ret void
165}
166
167; Same f32 pattern with the operands of the second (outer) maxnum commuted.
Matt Arsenault177ff422022-11-29 17:49:58 -0500168define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100169; SI-LABEL: test_fmax3_olt_1_f32:
170; SI: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500171; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100172; SI-NEXT: s_mov_b32 s11, 0xf000
173; SI-NEXT: s_mov_b32 s10, -1
174; SI-NEXT: s_mov_b32 s14, s10
175; SI-NEXT: s_mov_b32 s15, s11
176; SI-NEXT: s_mov_b32 s18, s10
177; SI-NEXT: s_mov_b32 s19, s11
178; SI-NEXT: s_mov_b32 s22, s10
179; SI-NEXT: s_mov_b32 s23, s11
180; SI-NEXT: s_waitcnt lgkmcnt(0)
181; SI-NEXT: s_mov_b32 s12, s2
182; SI-NEXT: s_mov_b32 s13, s3
183; SI-NEXT: s_mov_b32 s16, s4
184; SI-NEXT: s_mov_b32 s17, s5
185; SI-NEXT: s_mov_b32 s20, s6
186; SI-NEXT: s_mov_b32 s21, s7
187; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
188; SI-NEXT: s_waitcnt vmcnt(0)
189; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
190; SI-NEXT: s_waitcnt vmcnt(0)
191; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc
192; SI-NEXT: s_waitcnt vmcnt(0)
193; SI-NEXT: s_mov_b32 s8, s0
194; SI-NEXT: s_mov_b32 s9, s1
195; SI-NEXT: v_max3_f32 v0, v2, v0, v1
196; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
197; SI-NEXT: s_endpgm
198;
199; VI-LABEL: test_fmax3_olt_1_f32:
200; VI: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500201; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100202; VI-NEXT: s_mov_b32 s11, 0xf000
203; VI-NEXT: s_mov_b32 s10, -1
204; VI-NEXT: s_mov_b32 s14, s10
205; VI-NEXT: s_mov_b32 s15, s11
206; VI-NEXT: s_waitcnt lgkmcnt(0)
207; VI-NEXT: s_mov_b32 s12, s2
208; VI-NEXT: s_mov_b32 s13, s3
209; VI-NEXT: s_mov_b32 s16, s4
210; VI-NEXT: s_mov_b32 s17, s5
211; VI-NEXT: s_mov_b32 s18, s10
212; VI-NEXT: s_mov_b32 s19, s11
213; VI-NEXT: s_mov_b32 s4, s6
214; VI-NEXT: s_mov_b32 s5, s7
215; VI-NEXT: s_mov_b32 s6, s10
216; VI-NEXT: s_mov_b32 s7, s11
217; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
218; VI-NEXT: s_waitcnt vmcnt(0)
219; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
220; VI-NEXT: s_waitcnt vmcnt(0)
221; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
222; VI-NEXT: s_waitcnt vmcnt(0)
223; VI-NEXT: s_mov_b32 s8, s0
224; VI-NEXT: s_mov_b32 s9, s1
225; VI-NEXT: v_max3_f32 v0, v2, v0, v1
226; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
227; VI-NEXT: s_endpgm
228;
229; GFX9-LABEL: test_fmax3_olt_1_f32:
230; GFX9: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500231; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
Christudasan Devadasan229e1182024-07-23 13:59:15 +0530232; GFX9-NEXT: s_mov_b32 s3, 0xf000
233; GFX9-NEXT: s_mov_b32 s2, -1
Shilei Tian6548b632024-11-08 20:21:16 -0500234; GFX9-NEXT: s_mov_b32 s6, s2
235; GFX9-NEXT: s_mov_b32 s7, s3
Shilei Tianca336492024-11-08 16:36:10 -0500236; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500237; GFX9-NEXT: s_mov_b32 s4, s10
238; GFX9-NEXT: s_mov_b32 s5, s11
239; GFX9-NEXT: s_mov_b32 s16, s12
240; GFX9-NEXT: s_mov_b32 s17, s13
Shilei Tianca336492024-11-08 16:36:10 -0500241; GFX9-NEXT: s_mov_b32 s18, s2
242; GFX9-NEXT: s_mov_b32 s19, s3
Shilei Tian6548b632024-11-08 20:21:16 -0500243; GFX9-NEXT: s_mov_b32 s12, s14
244; GFX9-NEXT: s_mov_b32 s13, s15
245; GFX9-NEXT: s_mov_b32 s14, s2
246; GFX9-NEXT: s_mov_b32 s15, s3
247; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100248; GFX9-NEXT: s_waitcnt vmcnt(0)
249; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
250; GFX9-NEXT: s_waitcnt vmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500251; GFX9-NEXT: buffer_load_dword v2, off, s[12:15], 0 glc
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100252; GFX9-NEXT: s_waitcnt vmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500253; GFX9-NEXT: s_mov_b32 s0, s8
254; GFX9-NEXT: s_mov_b32 s1, s9
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100255; GFX9-NEXT: v_max3_f32 v0, v2, v0, v1
Christudasan Devadasan229e1182024-07-23 13:59:15 +0530256; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100257; GFX9-NEXT: s_endpgm
258;
259; GFX11-LABEL: test_fmax3_olt_1_f32:
260; GFX11: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500261; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100262; GFX11-NEXT: s_mov_b32 s10, -1
263; GFX11-NEXT: s_mov_b32 s11, 0x31016000
264; GFX11-NEXT: s_mov_b32 s14, s10
265; GFX11-NEXT: s_mov_b32 s15, s11
266; GFX11-NEXT: s_mov_b32 s18, s10
267; GFX11-NEXT: s_mov_b32 s19, s11
268; GFX11-NEXT: s_mov_b32 s22, s10
269; GFX11-NEXT: s_mov_b32 s23, s11
270; GFX11-NEXT: s_waitcnt lgkmcnt(0)
271; GFX11-NEXT: s_mov_b32 s12, s2
272; GFX11-NEXT: s_mov_b32 s13, s3
273; GFX11-NEXT: s_mov_b32 s16, s4
274; GFX11-NEXT: s_mov_b32 s17, s5
275; GFX11-NEXT: s_mov_b32 s20, s6
276; GFX11-NEXT: s_mov_b32 s21, s7
277; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
278; GFX11-NEXT: s_waitcnt vmcnt(0)
279; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
280; GFX11-NEXT: s_waitcnt vmcnt(0)
281; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
282; GFX11-NEXT: s_waitcnt vmcnt(0)
283; GFX11-NEXT: s_mov_b32 s8, s0
284; GFX11-NEXT: s_mov_b32 s9, s1
285; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
286; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100287; GFX11-NEXT: s_endpgm
Brox Chen8b23ebb2025-01-03 03:55:58 -0500288;
289; GFX12-LABEL: test_fmax3_olt_1_f32:
290; GFX12: ; %bb.0:
291; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
292; GFX12-NEXT: s_mov_b32 s10, -1
293; GFX12-NEXT: s_mov_b32 s11, 0x31016000
294; GFX12-NEXT: s_mov_b32 s14, s10
295; GFX12-NEXT: s_mov_b32 s15, s11
296; GFX12-NEXT: s_mov_b32 s18, s10
297; GFX12-NEXT: s_mov_b32 s19, s11
298; GFX12-NEXT: s_mov_b32 s22, s10
299; GFX12-NEXT: s_mov_b32 s23, s11
300; GFX12-NEXT: s_wait_kmcnt 0x0
301; GFX12-NEXT: s_mov_b32 s12, s2
302; GFX12-NEXT: s_mov_b32 s13, s3
303; GFX12-NEXT: s_mov_b32 s16, s4
304; GFX12-NEXT: s_mov_b32 s17, s5
305; GFX12-NEXT: s_mov_b32 s20, s6
306; GFX12-NEXT: s_mov_b32 s21, s7
307; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
308; GFX12-NEXT: s_wait_loadcnt 0x0
309; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
310; GFX12-NEXT: s_wait_loadcnt 0x0
311; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
312; GFX12-NEXT: s_wait_loadcnt 0x0
313; GFX12-NEXT: s_mov_b32 s8, s0
314; GFX12-NEXT: s_mov_b32 s9, s1
315; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1
316; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
317; GFX12-NEXT: s_endpgm
; Here the inner maxnum(a, b) is the SECOND operand of the outer maxnum.
; A single max3 is still expected on every run line, with the third load
; (v2) as its first source operand.
Matt Arsenault177ff422022-11-29 17:49:58 -0500318 %a = load volatile float, ptr addrspace(1) %aptr, align 4
319 %b = load volatile float, ptr addrspace(1) %bptr, align 4
320 %c = load volatile float, ptr addrspace(1) %cptr, align 4
Matt Arsenaultee324ff2017-05-17 19:25:06 +0000321 %f0 = call float @llvm.maxnum.f32(float %a, float %b)
322 %f1 = call float @llvm.maxnum.f32(float %c, float %f0)
Matt Arsenault177ff422022-11-29 17:49:58 -0500323 store float %f1, ptr addrspace(1) %out, align 4
Matt Arsenaultcc3c2b32014-11-14 20:08:52 +0000324 ret void
325}
Matt Arsenaultee324ff2017-05-17 19:25:06 +0000326
; Half-precision version of max(max(a, b), c). The expected selection differs
; between run lines: the default-CPU run converts the inputs to f32, uses
; v_max3_f32 and converts back; the tonga run uses a chain of two-operand
; v_max_f16; the gfx900 and gfx1100 runs use v_max3_f16; the gfx1200 run uses
; v_max3_num_f16.
Matt Arsenault177ff422022-11-29 17:49:58 -0500327define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100328; SI-LABEL: test_fmax3_olt_0_f16:
329; SI: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500330; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100331; SI-NEXT: s_mov_b32 s11, 0xf000
332; SI-NEXT: s_mov_b32 s10, -1
333; SI-NEXT: s_mov_b32 s14, s10
334; SI-NEXT: s_mov_b32 s15, s11
335; SI-NEXT: s_mov_b32 s18, s10
336; SI-NEXT: s_mov_b32 s19, s11
337; SI-NEXT: s_mov_b32 s22, s10
338; SI-NEXT: s_mov_b32 s23, s11
339; SI-NEXT: s_waitcnt lgkmcnt(0)
340; SI-NEXT: s_mov_b32 s12, s2
341; SI-NEXT: s_mov_b32 s13, s3
342; SI-NEXT: s_mov_b32 s16, s4
343; SI-NEXT: s_mov_b32 s17, s5
344; SI-NEXT: s_mov_b32 s20, s6
345; SI-NEXT: s_mov_b32 s21, s7
346; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
347; SI-NEXT: s_waitcnt vmcnt(0)
348; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
349; SI-NEXT: s_waitcnt vmcnt(0)
350; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc
351; SI-NEXT: s_waitcnt vmcnt(0)
352; SI-NEXT: s_mov_b32 s8, s0
353; SI-NEXT: s_mov_b32 s9, s1
354; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
355; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
356; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
357; SI-NEXT: v_max3_f32 v0, v0, v1, v2
358; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
359; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
360; SI-NEXT: s_endpgm
361;
362; VI-LABEL: test_fmax3_olt_0_f16:
363; VI: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500364; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100365; VI-NEXT: s_mov_b32 s11, 0xf000
366; VI-NEXT: s_mov_b32 s10, -1
367; VI-NEXT: s_mov_b32 s14, s10
368; VI-NEXT: s_mov_b32 s15, s11
369; VI-NEXT: s_waitcnt lgkmcnt(0)
370; VI-NEXT: s_mov_b32 s12, s2
371; VI-NEXT: s_mov_b32 s13, s3
372; VI-NEXT: s_mov_b32 s16, s4
373; VI-NEXT: s_mov_b32 s17, s5
374; VI-NEXT: s_mov_b32 s18, s10
375; VI-NEXT: s_mov_b32 s19, s11
376; VI-NEXT: s_mov_b32 s4, s6
377; VI-NEXT: s_mov_b32 s5, s7
378; VI-NEXT: s_mov_b32 s6, s10
379; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
380; VI-NEXT: s_waitcnt vmcnt(0)
381; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
382; VI-NEXT: s_waitcnt vmcnt(0)
383; VI-NEXT: s_mov_b32 s7, s11
384; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
385; VI-NEXT: s_waitcnt vmcnt(0)
386; VI-NEXT: s_mov_b32 s8, s0
387; VI-NEXT: s_mov_b32 s9, s1
388; VI-NEXT: v_max_f16_e32 v0, v0, v0
389; VI-NEXT: v_max_f16_e32 v1, v1, v1
390; VI-NEXT: v_max_f16_e32 v0, v0, v1
391; VI-NEXT: v_max_f16_e32 v1, v2, v2
392; VI-NEXT: v_max_f16_e32 v0, v0, v1
393; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
394; VI-NEXT: s_endpgm
395;
396; GFX9-LABEL: test_fmax3_olt_0_f16:
397; GFX9: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500398; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
Christudasan Devadasan229e1182024-07-23 13:59:15 +0530399; GFX9-NEXT: s_mov_b32 s3, 0xf000
400; GFX9-NEXT: s_mov_b32 s2, -1
Shilei Tian6548b632024-11-08 20:21:16 -0500401; GFX9-NEXT: s_mov_b32 s6, s2
402; GFX9-NEXT: s_mov_b32 s7, s3
Shilei Tianca336492024-11-08 16:36:10 -0500403; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500404; GFX9-NEXT: s_mov_b32 s4, s10
405; GFX9-NEXT: s_mov_b32 s5, s11
406; GFX9-NEXT: s_mov_b32 s16, s12
407; GFX9-NEXT: s_mov_b32 s17, s13
Shilei Tianca336492024-11-08 16:36:10 -0500408; GFX9-NEXT: s_mov_b32 s18, s2
409; GFX9-NEXT: s_mov_b32 s19, s3
Shilei Tian6548b632024-11-08 20:21:16 -0500410; GFX9-NEXT: s_mov_b32 s12, s14
411; GFX9-NEXT: s_mov_b32 s13, s15
412; GFX9-NEXT: s_mov_b32 s14, s2
413; GFX9-NEXT: s_mov_b32 s15, s3
414; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100415; GFX9-NEXT: s_waitcnt vmcnt(0)
416; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
417; GFX9-NEXT: s_waitcnt vmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500418; GFX9-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100419; GFX9-NEXT: s_waitcnt vmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500420; GFX9-NEXT: s_mov_b32 s0, s8
421; GFX9-NEXT: s_mov_b32 s1, s9
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100422; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2
Christudasan Devadasan229e1182024-07-23 13:59:15 +0530423; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100424; GFX9-NEXT: s_endpgm
425;
426; GFX11-LABEL: test_fmax3_olt_0_f16:
427; GFX11: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500428; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100429; GFX11-NEXT: s_mov_b32 s10, -1
430; GFX11-NEXT: s_mov_b32 s11, 0x31016000
431; GFX11-NEXT: s_mov_b32 s14, s10
432; GFX11-NEXT: s_mov_b32 s15, s11
433; GFX11-NEXT: s_mov_b32 s18, s10
434; GFX11-NEXT: s_mov_b32 s19, s11
435; GFX11-NEXT: s_mov_b32 s22, s10
436; GFX11-NEXT: s_mov_b32 s23, s11
437; GFX11-NEXT: s_waitcnt lgkmcnt(0)
438; GFX11-NEXT: s_mov_b32 s12, s2
439; GFX11-NEXT: s_mov_b32 s13, s3
440; GFX11-NEXT: s_mov_b32 s16, s4
441; GFX11-NEXT: s_mov_b32 s17, s5
442; GFX11-NEXT: s_mov_b32 s20, s6
443; GFX11-NEXT: s_mov_b32 s21, s7
444; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
445; GFX11-NEXT: s_waitcnt vmcnt(0)
446; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
447; GFX11-NEXT: s_waitcnt vmcnt(0)
448; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
449; GFX11-NEXT: s_waitcnt vmcnt(0)
450; GFX11-NEXT: s_mov_b32 s8, s0
451; GFX11-NEXT: s_mov_b32 s9, s1
452; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
453; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100454; GFX11-NEXT: s_endpgm
Brox Chen8b23ebb2025-01-03 03:55:58 -0500455;
456; GFX12-LABEL: test_fmax3_olt_0_f16:
457; GFX12: ; %bb.0:
458; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
459; GFX12-NEXT: s_mov_b32 s10, -1
460; GFX12-NEXT: s_mov_b32 s11, 0x31016000
461; GFX12-NEXT: s_mov_b32 s14, s10
462; GFX12-NEXT: s_mov_b32 s15, s11
463; GFX12-NEXT: s_mov_b32 s18, s10
464; GFX12-NEXT: s_mov_b32 s19, s11
465; GFX12-NEXT: s_mov_b32 s22, s10
466; GFX12-NEXT: s_mov_b32 s23, s11
467; GFX12-NEXT: s_wait_kmcnt 0x0
468; GFX12-NEXT: s_mov_b32 s12, s2
469; GFX12-NEXT: s_mov_b32 s13, s3
470; GFX12-NEXT: s_mov_b32 s16, s4
471; GFX12-NEXT: s_mov_b32 s17, s5
472; GFX12-NEXT: s_mov_b32 s20, s6
473; GFX12-NEXT: s_mov_b32 s21, s7
474; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
475; GFX12-NEXT: s_wait_loadcnt 0x0
476; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
477; GFX12-NEXT: s_wait_loadcnt 0x0
478; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
479; GFX12-NEXT: s_wait_loadcnt 0x0
480; GFX12-NEXT: s_mov_b32 s8, s0
481; GFX12-NEXT: s_mov_b32 s9, s1
482; GFX12-NEXT: v_max3_num_f16 v0, v0, v1, v2
483; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
484; GFX12-NEXT: s_endpgm
; All three inputs are volatile half loads. The inner maxnum(a, b) is the
; FIRST operand of the outer maxnum.
Matt Arsenault177ff422022-11-29 17:49:58 -0500485 %a = load volatile half, ptr addrspace(1) %aptr, align 2
486 %b = load volatile half, ptr addrspace(1) %bptr, align 2
487 %c = load volatile half, ptr addrspace(1) %cptr, align 2
Matt Arsenaultee324ff2017-05-17 19:25:06 +0000488 %f0 = call half @llvm.maxnum.f16(half %a, half %b)
489 %f1 = call half @llvm.maxnum.f16(half %f0, half %c)
Matt Arsenault177ff422022-11-29 17:49:58 -0500490 store half %f1, ptr addrspace(1) %out, align 2
Matt Arsenaultee324ff2017-05-17 19:25:06 +0000491 ret void
492}
493
494; Same f16 pattern with the operands of the second (outer) maxnum commuted.
Matt Arsenault177ff422022-11-29 17:49:58 -0500495define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100496; SI-LABEL: test_fmax3_olt_1_f16:
497; SI: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500498; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100499; SI-NEXT: s_mov_b32 s11, 0xf000
500; SI-NEXT: s_mov_b32 s10, -1
501; SI-NEXT: s_mov_b32 s14, s10
502; SI-NEXT: s_mov_b32 s15, s11
503; SI-NEXT: s_mov_b32 s18, s10
504; SI-NEXT: s_mov_b32 s19, s11
505; SI-NEXT: s_mov_b32 s22, s10
506; SI-NEXT: s_mov_b32 s23, s11
507; SI-NEXT: s_waitcnt lgkmcnt(0)
508; SI-NEXT: s_mov_b32 s12, s2
509; SI-NEXT: s_mov_b32 s13, s3
510; SI-NEXT: s_mov_b32 s16, s4
511; SI-NEXT: s_mov_b32 s17, s5
512; SI-NEXT: s_mov_b32 s20, s6
513; SI-NEXT: s_mov_b32 s21, s7
514; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
515; SI-NEXT: s_waitcnt vmcnt(0)
516; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
517; SI-NEXT: s_waitcnt vmcnt(0)
518; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc
519; SI-NEXT: s_waitcnt vmcnt(0)
520; SI-NEXT: s_mov_b32 s8, s0
521; SI-NEXT: s_mov_b32 s9, s1
522; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
523; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
524; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
525; SI-NEXT: v_max3_f32 v0, v2, v0, v1
526; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
527; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
528; SI-NEXT: s_endpgm
529;
530; VI-LABEL: test_fmax3_olt_1_f16:
531; VI: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500532; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100533; VI-NEXT: s_mov_b32 s11, 0xf000
534; VI-NEXT: s_mov_b32 s10, -1
535; VI-NEXT: s_mov_b32 s14, s10
536; VI-NEXT: s_mov_b32 s15, s11
537; VI-NEXT: s_waitcnt lgkmcnt(0)
538; VI-NEXT: s_mov_b32 s12, s2
539; VI-NEXT: s_mov_b32 s13, s3
540; VI-NEXT: s_mov_b32 s16, s4
541; VI-NEXT: s_mov_b32 s17, s5
542; VI-NEXT: s_mov_b32 s18, s10
543; VI-NEXT: s_mov_b32 s19, s11
544; VI-NEXT: s_mov_b32 s4, s6
545; VI-NEXT: s_mov_b32 s5, s7
546; VI-NEXT: s_mov_b32 s6, s10
547; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
548; VI-NEXT: s_waitcnt vmcnt(0)
549; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
550; VI-NEXT: s_waitcnt vmcnt(0)
551; VI-NEXT: s_mov_b32 s7, s11
552; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
553; VI-NEXT: s_waitcnt vmcnt(0)
554; VI-NEXT: s_mov_b32 s8, s0
555; VI-NEXT: s_mov_b32 s9, s1
556; VI-NEXT: v_max_f16_e32 v0, v0, v0
557; VI-NEXT: v_max_f16_e32 v1, v1, v1
558; VI-NEXT: v_max_f16_e32 v0, v0, v1
559; VI-NEXT: v_max_f16_e32 v1, v2, v2
560; VI-NEXT: v_max_f16_e32 v0, v1, v0
561; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
562; VI-NEXT: s_endpgm
563;
564; GFX9-LABEL: test_fmax3_olt_1_f16:
565; GFX9: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500566; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
Christudasan Devadasan229e1182024-07-23 13:59:15 +0530567; GFX9-NEXT: s_mov_b32 s3, 0xf000
568; GFX9-NEXT: s_mov_b32 s2, -1
Shilei Tian6548b632024-11-08 20:21:16 -0500569; GFX9-NEXT: s_mov_b32 s6, s2
570; GFX9-NEXT: s_mov_b32 s7, s3
Shilei Tianca336492024-11-08 16:36:10 -0500571; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500572; GFX9-NEXT: s_mov_b32 s4, s10
573; GFX9-NEXT: s_mov_b32 s5, s11
574; GFX9-NEXT: s_mov_b32 s16, s12
575; GFX9-NEXT: s_mov_b32 s17, s13
Shilei Tianca336492024-11-08 16:36:10 -0500576; GFX9-NEXT: s_mov_b32 s18, s2
577; GFX9-NEXT: s_mov_b32 s19, s3
Shilei Tian6548b632024-11-08 20:21:16 -0500578; GFX9-NEXT: s_mov_b32 s12, s14
579; GFX9-NEXT: s_mov_b32 s13, s15
580; GFX9-NEXT: s_mov_b32 s14, s2
581; GFX9-NEXT: s_mov_b32 s15, s3
582; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100583; GFX9-NEXT: s_waitcnt vmcnt(0)
584; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
585; GFX9-NEXT: s_waitcnt vmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500586; GFX9-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100587; GFX9-NEXT: s_waitcnt vmcnt(0)
Shilei Tian6548b632024-11-08 20:21:16 -0500588; GFX9-NEXT: s_mov_b32 s0, s8
589; GFX9-NEXT: s_mov_b32 s1, s9
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100590; GFX9-NEXT: v_max3_f16 v0, v2, v0, v1
Christudasan Devadasan229e1182024-07-23 13:59:15 +0530591; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100592; GFX9-NEXT: s_endpgm
593;
594; GFX11-LABEL: test_fmax3_olt_1_f16:
595; GFX11: ; %bb.0:
Shilei Tian6548b632024-11-08 20:21:16 -0500596; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100597; GFX11-NEXT: s_mov_b32 s10, -1
598; GFX11-NEXT: s_mov_b32 s11, 0x31016000
599; GFX11-NEXT: s_mov_b32 s14, s10
600; GFX11-NEXT: s_mov_b32 s15, s11
601; GFX11-NEXT: s_mov_b32 s18, s10
602; GFX11-NEXT: s_mov_b32 s19, s11
603; GFX11-NEXT: s_mov_b32 s22, s10
604; GFX11-NEXT: s_mov_b32 s23, s11
605; GFX11-NEXT: s_waitcnt lgkmcnt(0)
606; GFX11-NEXT: s_mov_b32 s12, s2
607; GFX11-NEXT: s_mov_b32 s13, s3
608; GFX11-NEXT: s_mov_b32 s16, s4
609; GFX11-NEXT: s_mov_b32 s17, s5
610; GFX11-NEXT: s_mov_b32 s20, s6
611; GFX11-NEXT: s_mov_b32 s21, s7
612; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
613; GFX11-NEXT: s_waitcnt vmcnt(0)
614; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
615; GFX11-NEXT: s_waitcnt vmcnt(0)
616; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
617; GFX11-NEXT: s_waitcnt vmcnt(0)
618; GFX11-NEXT: s_mov_b32 s8, s0
619; GFX11-NEXT: s_mov_b32 s9, s1
620; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
621; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100622; GFX11-NEXT: s_endpgm
Brox Chen8b23ebb2025-01-03 03:55:58 -0500623;
624; GFX12-LABEL: test_fmax3_olt_1_f16:
625; GFX12: ; %bb.0:
626; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
627; GFX12-NEXT: s_mov_b32 s10, -1
628; GFX12-NEXT: s_mov_b32 s11, 0x31016000
629; GFX12-NEXT: s_mov_b32 s14, s10
630; GFX12-NEXT: s_mov_b32 s15, s11
631; GFX12-NEXT: s_mov_b32 s18, s10
632; GFX12-NEXT: s_mov_b32 s19, s11
633; GFX12-NEXT: s_mov_b32 s22, s10
634; GFX12-NEXT: s_mov_b32 s23, s11
635; GFX12-NEXT: s_wait_kmcnt 0x0
636; GFX12-NEXT: s_mov_b32 s12, s2
637; GFX12-NEXT: s_mov_b32 s13, s3
638; GFX12-NEXT: s_mov_b32 s16, s4
639; GFX12-NEXT: s_mov_b32 s17, s5
640; GFX12-NEXT: s_mov_b32 s20, s6
641; GFX12-NEXT: s_mov_b32 s21, s7
642; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
643; GFX12-NEXT: s_wait_loadcnt 0x0
644; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
645; GFX12-NEXT: s_wait_loadcnt 0x0
646; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
647; GFX12-NEXT: s_wait_loadcnt 0x0
648; GFX12-NEXT: s_mov_b32 s8, s0
649; GFX12-NEXT: s_mov_b32 s9, s1
650; GFX12-NEXT: v_max3_num_f16 v0, v2, v0, v1
651; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
652; GFX12-NEXT: s_endpgm
; Here the inner maxnum(a, b) is the SECOND operand of the outer maxnum.
; Where a three-operand max is expected, it takes the third load (v2) as its
; first source operand; the tonga run's final v_max_f16 has its operands
; swapped to match.
Matt Arsenault177ff422022-11-29 17:49:58 -0500653 %a = load volatile half, ptr addrspace(1) %aptr, align 2
654 %b = load volatile half, ptr addrspace(1) %bptr, align 2
655 %c = load volatile half, ptr addrspace(1) %cptr, align 2
Matt Arsenaultee324ff2017-05-17 19:25:06 +0000656 %f0 = call half @llvm.maxnum.f16(half %a, half %b)
657 %f1 = call half @llvm.maxnum.f16(half %c, half %f0)
Matt Arsenault177ff422022-11-29 17:49:58 -0500658 store half %f1, ptr addrspace(1) %out, align 2
Matt Arsenaultee324ff2017-05-17 19:25:06 +0000659 ret void
660}
661
Farhana Aleene80aeac2018-04-03 23:00:30 +0000662; performMinMaxCombine() must not combine vector maxnum patterns into max3,
663; since there is no packed max3 instruction.
Matt Arsenault687ec752018-10-22 16:27:27 +0000664define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100665; SI-LABEL: no_fmax3_v2f16:
666; SI: ; %bb.0: ; %entry
667; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
668; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
669; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
670; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
671; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
672; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
673; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
674; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
675; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
676; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
677; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
678; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
679; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
680; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
681; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
682; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
683; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
684; SI-NEXT: v_max_f32_e32 v1, v1, v3
685; SI-NEXT: v_max_f32_e32 v0, v0, v2
686; SI-NEXT: v_max3_f32 v0, v4, v0, v6
687; SI-NEXT: v_max3_f32 v1, v5, v1, v7
688; SI-NEXT: s_setpc_b64 s[30:31]
689;
690; VI-LABEL: no_fmax3_v2f16:
691; VI: ; %bb.0: ; %entry
692; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
693; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
694; VI-NEXT: v_max_f16_e32 v0, v0, v1
695; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
696; VI-NEXT: v_max_f16_e32 v0, v2, v0
697; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
698; VI-NEXT: v_max_f16_e32 v0, v0, v3
699; VI-NEXT: v_or_b32_e32 v0, v0, v1
700; VI-NEXT: s_setpc_b64 s[30:31]
701;
702; GFX9-LABEL: no_fmax3_v2f16:
703; GFX9: ; %bb.0: ; %entry
704; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
705; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
706; GFX9-NEXT: v_pk_max_f16 v0, v2, v0
707; GFX9-NEXT: v_pk_max_f16 v0, v0, v3
708; GFX9-NEXT: s_setpc_b64 s[30:31]
709;
710; GFX11-LABEL: no_fmax3_v2f16:
711; GFX11: ; %bb.0: ; %entry
712; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
Ivan Kosarev150c73a2023-06-14 11:40:48 +0100713; GFX11-NEXT: v_pk_max_f16 v0, v0, v1
714; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
715; GFX11-NEXT: v_pk_max_f16 v0, v2, v0
716; GFX11-NEXT: v_pk_max_f16 v0, v0, v3
717; GFX11-NEXT: s_setpc_b64 s[30:31]
Brox Chen8b23ebb2025-01-03 03:55:58 -0500718;
719; GFX12-LABEL: no_fmax3_v2f16:
720; GFX12: ; %bb.0: ; %entry
721; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
722; GFX12-NEXT: s_wait_expcnt 0x0
723; GFX12-NEXT: s_wait_samplecnt 0x0
724; GFX12-NEXT: s_wait_bvhcnt 0x0
725; GFX12-NEXT: s_wait_kmcnt 0x0
726; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1
727; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
728; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0
729; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3
730; GFX12-NEXT: s_setpc_b64 s[30:31]
; Three chained <2 x half> maxnum calls: max(max(c, max(a, b)), d).
; The gfx900, gfx1100 and gfx1200 runs expect three packed two-operand max
; instructions, and the tonga run expects only two-operand v_max_f16; none of
; them forms a max3. The default-CPU run is the exception: its expected
; output works on f32 values per element and does contain v_max3_f32.
Farhana Aleene80aeac2018-04-03 23:00:30 +0000731entry:
Matt Arsenault687ec752018-10-22 16:27:27 +0000732 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
733 %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
734 %res = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
Farhana Aleene80aeac2018-04-03 23:00:30 +0000735 ret <2 x half> %res
736}
737
Matt Arsenaultee324ff2017-05-17 19:25:06 +0000738declare i32 @llvm.amdgcn.workitem.id.x() #1
739declare float @llvm.maxnum.f32(float, float) #1
740declare half @llvm.maxnum.f16(half, half) #1
Farhana Aleene80aeac2018-04-03 23:00:30 +0000741declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
Matt Arsenaultee324ff2017-05-17 19:25:06 +0000742
743attributes #0 = { nounwind }
744attributes #1 = { nounwind readnone speculatable }
Matt Arsenault687ec752018-10-22 16:27:27 +0000745attributes #2 = { nounwind "no-nans-fp-math"="true" }