| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s |
| ; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s |
| |
| ; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s |
| ; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s |
| |
| ; FIXME: This should also fold when fma is actually fast if an FMA |
| ; exists in the original program. |
| |
| ; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z)) |
| ; Single-use case. The fmul feeds only the fmuladd, and the fmuladd feeds |
| ; only the fadd, so no intermediate value has any other user. |
| define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 { |
| ; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v4 |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v2, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v2, v3, v4, v2 |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; Five volatile loads from an undef pointer supply the float inputs. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile float, ptr addrspace(1) undef |
| %v = load volatile float, ptr addrspace(1) undef |
| %mul.u.v = fmul fast float %u, %v |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| %add = fadd fast float %fma, %z |
| store volatile float %add, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Single-use case with an fsub. z is subtracted from the fmuladd result, and |
| ; neither the fmul nor the fmuladd has any other user. |
| define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 { |
| ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_mad_f32 v2, v3, v4, -v2 |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v2, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v2, v3, v4, -v2 |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3 |
| ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v0, v0, v2 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; Five volatile loads from an undef pointer supply the float inputs. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile float, ptr addrspace(1) undef |
| %v = load volatile float, ptr addrspace(1) undef |
| %mul.u.v = fmul fast float %u, %v |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| ; Despite its name, %add holds the result of an fsub. |
| %add = fsub fast float %fma, %z |
| store volatile float %add, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Multi-use fmul case. The fmul result is stored to memory as well as being |
| ; the addend of the fmuladd, and the checks below expect a standalone |
| ; multiply plus a separate final add in every run configuration. |
| define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 { |
| ; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_mul: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 |
| ; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v3, v2 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 |
| ; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; Five volatile loads from an undef pointer supply the float inputs. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile float, ptr addrspace(1) undef |
| %v = load volatile float, ptr addrspace(1) undef |
| %mul.u.v = fmul fast float %u, %v |
| ; Second use of the fmul result, stored before the fmuladd is computed. |
| store volatile float %mul.u.v, ptr addrspace(1) undef |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| %add = fadd fast float %fma, %z |
| store volatile float %add, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Multi-use fmul case with the fadd operands commuted, so z is the left |
| ; operand and the fmuladd result is the right operand. The fmul result is |
| ; also stored to memory. |
| define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 { |
| ; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_mul_commute: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 |
| ; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul_commute: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 |
| ; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v2, v0 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul_commute: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v2, v0 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; Five volatile loads from an undef pointer supply the float inputs. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile float, ptr addrspace(1) undef |
| %v = load volatile float, ptr addrspace(1) undef |
| %mul.u.v = fmul fast float %u, %v |
| ; Second use of the fmul result, stored before the fmuladd is computed. |
| store volatile float %mul.u.v, ptr addrspace(1) undef |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| ; Commuted operand order, with z first and the fmuladd result second. |
| %add = fadd fast float %z, %fma |
| store volatile float %add, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Multi-use fmuladd case. The fmuladd result is stored to memory as well as |
| ; being an operand of the fadd, and the checks below expect the final add to |
| ; remain a separate instruction in every run configuration. |
| define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 { |
| ; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v3, v2 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; Five volatile loads from an undef pointer supply the float inputs. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile float, ptr addrspace(1) undef |
| %v = load volatile float, ptr addrspace(1) undef |
| %mul.u.v = fmul fast float %u, %v |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| ; Second use of the fmuladd result, stored before the fadd is computed. |
| store volatile float %fma, ptr addrspace(1) undef |
| %add = fadd fast float %fma, %z |
| store volatile float %add, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Multi-use fmuladd case with the fadd operands commuted, so z is the left |
| ; operand and the fmuladd result is the right operand. The fmuladd result is |
| ; also stored to memory. |
| define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 { |
| ; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd_commute: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v2, v3 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd_commute: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v2, v0 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd_commute: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v2, v0 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; Five volatile loads from an undef pointer supply the float inputs. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile float, ptr addrspace(1) undef |
| %v = load volatile float, ptr addrspace(1) undef |
| %mul.u.v = fmul fast float %u, %v |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| ; Second use of the fmuladd result, stored before the fadd is computed. |
| store volatile float %fma, ptr addrspace(1) undef |
| ; Commuted operand order, with z first and the fmuladd result second. |
| %add = fadd fast float %z, %fma |
| store volatile float %add, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Multi-use fmul case with an fsub. The fmul result is stored to memory as |
| ; well as being the addend of the fmuladd, and z is subtracted from the |
| ; fmuladd result. |
| define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { |
| ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul_multi_use_mul: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FLUSH-NEXT: v_mad_f32 v0, v0, v1, v3 |
| ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v0, v2 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_mul: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 |
| ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v0, v0, v2 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_mul: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3 |
| ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v0, v0, v2 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; Five volatile loads from an undef pointer supply the float inputs. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile float, ptr addrspace(1) undef |
| %v = load volatile float, ptr addrspace(1) undef |
| %mul.u.v = fmul fast float %u, %v |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| %sub = fsub fast float %fma, %z |
| ; Second use of the fmul result, stored after the fsub has been computed. |
| store volatile float %mul.u.v, ptr addrspace(1) undef |
| store volatile float %sub, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Multi-use fmuladd case with an fsub whose left operand is the fmuladd |
| ; result. The fmuladd result is also stored to memory. |
| define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs() #0 { |
| ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 |
| ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v3, v2 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 |
| ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v0, v2 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3 |
| ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v0, v2 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; Five volatile loads from an undef pointer supply the float inputs. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile float, ptr addrspace(1) undef |
| %v = load volatile float, ptr addrspace(1) undef |
| %mul.u.v = fmul fast float %u, %v |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| ; Despite its name, %add holds the result of an fsub. |
| %add = fsub fast float %fma, %z |
| ; Second use of the fmuladd result, stored after the fsub has been computed. |
| store volatile float %fma, ptr addrspace(1) undef |
| store volatile float %add, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Multi-use fmuladd as the right-hand operand of a fast fsub: |
| ;   z - fmuladd(x, y, u * v) |
| ; The fmuladd result is stored on its own as well, so it has a second use |
| ; besides the fsub. In every expected output below the u * v product is a |
| ; standalone v_mul_f32, the multiply-add value is kept in a register, and a |
| ; separate v_sub_f32 follows. The flush-denormal runs form the multiply-add |
| ; with v_mac_f32, the IEEE fast-FMA run with v_fma_f32, and the IEEE slow-FMA |
| ; run with v_mul_f32 followed by v_add_f32. |
| define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs() #0 { |
| ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 |
| ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v3 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 |
| ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v2, v0 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3 |
| ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v2, v0 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; Five volatile f32 loads supply the operands x, y, z, u and v. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile float, ptr addrspace(1) undef |
| %v = load volatile float, ptr addrspace(1) undef |
| %mul.u.v = fmul fast float %u, %v |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| ; Despite its name, %add is the fsub result; %fma is the right-hand operand. |
| %add = fsub fast float %z, %fma |
| ; Storing %fma as well as %add is what gives the fmuladd its second use. |
| store volatile float %fma, ptr addrspace(1) undef |
| store volatile float %add, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Multi-use fmuladd as the left-hand operand of a fast fsub, where the addend |
| ; of the fmuladd is a half-precision product extended to float: |
| ;   fmuladd(x, y, fpext(u * v)) - z |
| ; The fmuladd result is stored on its own as well, so it has a second use |
| ; besides the fsub. In every expected output below u and v are loaded as |
| ; 16-bit values, each is converted with v_cvt_f32_f16, and the product is a |
| ; v_mul_f32; the multiply-add value is kept in a register and a separate |
| ; v_sub_f32 follows. |
| define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() #0 { |
| ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 |
| ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v4 |
| ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 |
| ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v3, v2 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 |
| ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 |
| ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 |
| ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v0, v2 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 |
| ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v1, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v0, v2 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; x, y and z are volatile f32 loads; u and v are volatile half loads. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile half, ptr addrspace(1) undef |
| %v = load volatile half, ptr addrspace(1) undef |
| ; The product is formed in half precision and then extended to float. |
| %mul.u.v.half = fmul fast half %u, %v |
| %mul.u.v = fpext half %mul.u.v.half to float |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| ; Despite its name, %add is the fsub result; %fma is the left-hand operand. |
| %add = fsub fast float %fma, %z |
| ; Storing %fma as well as %add is what gives the fmuladd its second use. |
| store volatile float %fma, ptr addrspace(1) undef |
| store volatile float %add, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Multi-use fmuladd as the right-hand operand of a fast fsub, where the addend |
| ; of the fmuladd is a half-precision product extended to float: |
| ;   z - fmuladd(x, y, fpext(u * v)) |
| ; The fmuladd result is stored on its own as well, so it has a second use |
| ; besides the fsub. In every expected output below u and v are loaded as |
| ; 16-bit values, each is converted with v_cvt_f32_f16, and the product is a |
| ; v_mul_f32; the multiply-add value is kept in a register and a separate |
| ; v_sub_f32 follows. |
| define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() #0 { |
| ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs: |
| ; GCN-FLUSH: ; %bb.0: |
| ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 |
| ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v4 |
| ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1 |
| ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v3 |
| ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FLUSH-NEXT: s_endpgm |
| ; |
| ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs: |
| ; GCN-FASTFMA: ; %bb.0: |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 |
| ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 |
| ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4 |
| ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3 |
| ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v2, v0 |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0 |
| ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-FASTFMA-NEXT: s_endpgm |
| ; |
| ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs: |
| ; GCN-SLOWFMA: ; %bb.0: |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1 |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v3, v3 |
| ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v4, v4 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v1, v3, v4 |
| ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v1 |
| ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v2, v0 |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0 |
| ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-SLOWFMA-NEXT: s_endpgm |
| ; x, y and z are volatile f32 loads; u and v are volatile half loads. |
| %x = load volatile float, ptr addrspace(1) undef |
| %y = load volatile float, ptr addrspace(1) undef |
| %z = load volatile float, ptr addrspace(1) undef |
| %u = load volatile half, ptr addrspace(1) undef |
| %v = load volatile half, ptr addrspace(1) undef |
| ; The product is formed in half precision and then extended to float. |
| %mul.u.v.half = fmul fast half %u, %v |
| %mul.u.v = fpext half %mul.u.v.half to float |
| %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v) |
| ; Despite its name, %add is the fsub result; %fma is the right-hand operand. |
| %add = fsub fast float %z, %fma |
| ; Storing %fma as well as %add is what gives the fmuladd its second use. |
| store volatile float %fma, ptr addrspace(1) undef |
| store volatile float %add, ptr addrspace(1) undef |
| ret void |
| } |
| |
| ; Intrinsic declarations; both carry attribute group #1. |
| declare float @llvm.fma.f32(float, float, float) #1 |
| declare float @llvm.fmuladd.f32(float, float, float) #1 |
| |
| ; #0 is the attribute group referenced by the kernel definitions in this file. |
| attributes #0 = { nounwind } |
| ; #1 is the attribute group referenced by the intrinsic declarations above. |
| attributes #1 = { nounwind readnone } |
| ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: |
| ; GCN: {{.*}} |