; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s

; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s

; FIXME: This should also fold when fma is actually fast if an FMA
; exists in the original program.

; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))

; fadd(fmuladd(x, y, fmul(u, v)), z) where the fmul and the fmuladd each have
; a single use. The flush-denormal and fast-fma runs expect two chained
; mac/fma instructions; the slow-fma run expects separate muls and adds.
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]]
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]

; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
; GCN-FASTFMA: buffer_store_dword [[FMA1]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; fsub(fmuladd(x, y, fmul(u, v)), z) where the fmul and the fmuladd each have
; a single use. The subtraction of z is expected to be absorbed as a negated
; addend of the inner mad/fma.
; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; The stored value is the accumulator written by the v_mac, i.e. TMP.
; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]+]], [[U]], [[V]], -[[Z]]
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[TMP]]

; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
; GCN-FASTFMA: buffer_store_dword [[FMA1]]
define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %sub = fsub fast float %fma, %z
  store volatile float %sub, float addrspace(1)* undef
  ret void
}

; fadd(fmuladd(x, y, fmul(u, v)), z) where the fmul has a second use (its
; result is also stored), so the product must stay available as a separate
; v_mul and the trailing add is kept as a separate instruction.
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; The add consumes the v_mac accumulator, which is the MUL register.
; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[MUL]], [[Z]]

; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  store volatile float %mul.u.v, float addrspace(1)* undef
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; Same as the multi-use-mul case, but with the fadd operands commuted in the
; IR (z + fmuladd instead of fmuladd + z).
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; The add consumes the v_mac accumulator, which is the MUL register.
; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[MUL]]

; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  store volatile float %mul.u.v, float addrspace(1)* undef
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %add = fadd fast float %z, %fma
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; fadd(fmuladd(x, y, fmul(u, v)), z) where the fmuladd result has a second use
; (it is also stored). Only the slow-fma run has instruction checks here.
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  store volatile float %fma, float addrspace(1)* undef
  %add = fadd fast float %fma, %z
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; Same as the multi-use-fmuladd case, but with the fadd operands commuted in
; the IR (z + fmuladd). Only the slow-fma run has instruction checks here.
; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_add_f32_e32
define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  store volatile float %fma, float addrspace(1)* undef
  %add = fadd fast float %z, %fma
  store volatile float %add, float addrspace(1)* undef
  ret void
}

; fsub(fmuladd(x, y, fmul(u, v)), z) where the fmul has a second use (its
; result is also stored). The product and the final difference are both
; stored, in that order.
; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_mul:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]

; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]

; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32 [[SUB:v[0-9]+]]

; The second store is the v_sub result in every configuration.
; GCN: buffer_store_dword [[MUL]]
; GCN: buffer_store_dword [[SUB]]
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %sub = fsub fast float %fma, %z
  store volatile float %mul.u.v, float addrspace(1)* undef
  store volatile float %sub, float addrspace(1)* undef
  ret void
}

; fsub(fmuladd(x, y, fmul(u, v)), z) where the fmuladd result has a second use
; (it is also stored) and is the left-hand operand of the fsub.
; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]

; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; The fma addend is the u*v product computed by the v_mul above.
; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %sub = fsub fast float %fma, %z
  store volatile float %fma, float addrspace(1)* undef
  store volatile float %sub, float addrspace(1)* undef
  ret void
}

; fsub(z, fmuladd(x, y, fmul(u, v))) where the fmuladd result has a second use
; (it is also stored) and is the right-hand operand of the fsub.
; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]

; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]

; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; The fma addend is the u*v product computed by the v_mul above.
; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile float, float addrspace(1)* undef
  %v = load volatile float, float addrspace(1)* undef
  %mul.u.v = fmul fast float %u, %v
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %sub = fsub fast float %z, %fma
  store volatile float %fma, float addrspace(1)* undef
  store volatile float %sub, float addrspace(1)* undef
  ret void
}

; Variant of the multi-use-fmuladd lhs case where the product is computed in
; half precision and extended to float (fpext(fmul half)) before feeding the
; fmuladd. The checks expect both half inputs to be converted to f32 and the
; multiply to be done in f32.
; GCN-LABEL: {{^}}fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_ushort [[U:v[0-9]+]]
; GCN: buffer_load_ushort [[V:v[0-9]+]]

; GCN-DAG: v_cvt_f32_f16_e32 [[UFLOAT:v[0-9]+]], [[U]]
; GCN-DAG: v_cvt_f32_f16_e32 [[VFLOAT:v[0-9]+]], [[V]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[UFLOAT]], [[VFLOAT]]

; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; The fma addend is the product computed by the v_mul above.
; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32
define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile half, half addrspace(1)* undef
  %v = load volatile half, half addrspace(1)* undef
  %mul.u.v.half = fmul fast half %u, %v
  %mul.u.v = fpext half %mul.u.v.half to float
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %sub = fsub fast float %fma, %z
  store volatile float %fma, float addrspace(1)* undef
  store volatile float %sub, float addrspace(1)* undef
  ret void
}

; Variant of the multi-use-fmuladd rhs case where the product is computed in
; half precision and extended to float (fpext(fmul half)) before feeding the
; fmuladd. The checks expect both half inputs to be converted to f32 and the
; multiply to be done in f32.
; GCN-LABEL: {{^}}fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: buffer_load_ushort [[U:v[0-9]+]]
; GCN: buffer_load_ushort [[V:v[0-9]+]]

; GCN-DAG: v_cvt_f32_f16_e32 [[UFLOAT:v[0-9]+]], [[U]]
; GCN-DAG: v_cvt_f32_f16_e32 [[VFLOAT:v[0-9]+]], [[V]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[UFLOAT]], [[VFLOAT]]

; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; The fma addend is the product computed by the v_mul above.
; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[MUL]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
; GCN-SLOWFMA: v_sub_f32_e32
define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() #0 {
  %x = load volatile float, float addrspace(1)* undef
  %y = load volatile float, float addrspace(1)* undef
  %z = load volatile float, float addrspace(1)* undef
  %u = load volatile half, half addrspace(1)* undef
  %v = load volatile half, half addrspace(1)* undef
  %mul.u.v.half = fmul fast half %u, %v
  %mul.u.v = fpext half %mul.u.v.half to float
  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  %sub = fsub fast float %z, %fma
  store volatile float %fma, float addrspace(1)* undef
  store volatile float %sub, float addrspace(1)* undef
  ret void
}

; Intrinsic declarations. llvm.fmuladd.f32 is the intrinsic called by the
; kernels in this file; llvm.fma.f32 is declared but not called by any kernel
; visible here.
declare float @llvm.fma.f32(float, float, float) #1
declare float @llvm.fmuladd.f32(float, float, float) #1

; #0 is attached to the kernels, #1 to the intrinsic declarations.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }