# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -start-before=machine-scheduler -o - %s | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -mattr=+tgsplit -start-before=machine-scheduler -o - %s | FileCheck -check-prefix=GFX9-TGS %s
# Check workgroup fences on GFX9 do not add scheduling latency.
# s_barrier should occur before s_waitcnts to hide load latency.
# Latency should still be added when tg-split is enabled.
# This allows merging of pre-barrier atomic fence with waits on loads.
# LLVM IR to help syncscope IDs match MIR
# SSID 2 = workgroup
# SSID 3 = wavefront
--- |
; The IR function below exists so the syncscope IDs referenced by the MIR
; resolve against the module's syncscope table (2 = workgroup, 3 = wavefront,
; per the header comments). All real instructions live in the MIR body; the
; GFX9/GFX9-TGS lines are autogenerated FileCheck assertions over the final
; ISA and must not be edited by hand.
define amdgpu_kernel void @test_workgroup() {
; GFX9-LABEL: test_workgroup:
; GFX9: ; %bb.0:
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: global_load_ushort v14, v[0:1], off
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX9-NEXT: global_load_ushort v15, v[4:5], off
; GFX9-NEXT: ; implicit-def: $vgpr2
; GFX9-NEXT: v_add_u32_e32 v0, 1, v2
; GFX9-NEXT: ; implicit-def: $vgpr7
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v7
; GFX9-NEXT: v_sub_u32_e32 v2, v0, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 14, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v1, 0x14c5d7, v1
; GFX9-NEXT: ; implicit-def: $vgpr6
; GFX9-NEXT: v_add3_u32 v2, v6, v1, v0
; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3]
; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-NEXT: v_lshl_add_u64 v[2:3], s[0:1], 0, v[4:5]
; GFX9-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[4:5]
; GFX9-NEXT: global_load_ushort v17, v[2:3], off
; GFX9-NEXT: global_load_ushort v18, v[0:1], off
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX9-NEXT: ; implicit-def: $vgpr10_vgpr11
; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX9-NEXT: ; implicit-def: $vgpr12_vgpr13
; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: v_lshl_add_u64 v[4:5], s[4:5], 0, v[4:5]
; GFX9-NEXT: v_lshl_add_u64 v[2:3], s[4:5], 0, v[2:3]
; GFX9-NEXT: s_barrier
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_cvt_f32_f16_e32 v14, v14
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_cvt_f32_f16_e32 v16, v15
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cvt_f32_f16_e32 v17, v17
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v15, v18
; GFX9-NEXT: v_pk_add_f32 v[0:1], v[16:17], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
; GFX9-NEXT: v_pk_fma_f32 v[10:11], v[14:15], s[0:1], v[10:11] op_sel_hi:[1,0,1]
; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[8:9], v[0:1]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[12:13], v[0:1]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_pk_add_f32 v[0:1], v[10:11], v[0:1]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[6:7], v[0:1]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: global_store_short v[4:5], v1, off
;
; GFX9-TGS-LABEL: test_workgroup:
; GFX9-TGS: ; %bb.0:
; GFX9-TGS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-TGS-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-TGS-NEXT: global_load_ushort v14, v[0:1], off
; GFX9-TGS-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX9-TGS-NEXT: global_load_ushort v15, v[4:5], off
; GFX9-TGS-NEXT: ; implicit-def: $vgpr2
; GFX9-TGS-NEXT: v_add_u32_e32 v0, 1, v2
; GFX9-TGS-NEXT: ; implicit-def: $vgpr7
; GFX9-TGS-NEXT: v_mul_hi_u32 v1, v0, v7
; GFX9-TGS-NEXT: v_sub_u32_e32 v2, v0, v1
; GFX9-TGS-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX9-TGS-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-TGS-NEXT: v_lshrrev_b32_e32 v1, 14, v1
; GFX9-TGS-NEXT: v_mul_u32_u24_e32 v1, 0x14c5d7, v1
; GFX9-TGS-NEXT: ; implicit-def: $vgpr6
; GFX9-TGS-NEXT: v_add3_u32 v2, v6, v1, v0
; GFX9-TGS-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3]
; GFX9-TGS-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-TGS-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[4:5]
; GFX9-TGS-NEXT: global_load_ushort v18, v[0:1], off
; GFX9-TGS-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[4:5]
; GFX9-TGS-NEXT: global_load_ushort v17, v[0:1], off
; GFX9-TGS-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; GFX9-TGS-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX9-TGS-NEXT: ; implicit-def: $vgpr10_vgpr11
; GFX9-TGS-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX9-TGS-NEXT: ; implicit-def: $vgpr12_vgpr13
; GFX9-TGS-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX9-TGS-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-TGS-NEXT: v_lshl_add_u64 v[4:5], s[4:5], 0, v[4:5]
; GFX9-TGS-NEXT: v_lshl_add_u64 v[2:3], s[4:5], 0, v[2:3]
; GFX9-TGS-NEXT: s_waitcnt vmcnt(3)
; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v14, v14
; GFX9-TGS-NEXT: s_waitcnt vmcnt(2)
; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v16, v15
; GFX9-TGS-NEXT: s_waitcnt vmcnt(1)
; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v15, v18
; GFX9-TGS-NEXT: s_waitcnt vmcnt(0)
; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v17, v17
; GFX9-TGS-NEXT: v_pk_fma_f32 v[10:11], v[14:15], s[0:1], v[10:11] op_sel_hi:[1,0,1]
; GFX9-TGS-NEXT: v_pk_add_f32 v[0:1], v[16:17], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
; GFX9-TGS-NEXT: s_nop 0
; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[8:9], v[0:1]
; GFX9-TGS-NEXT: s_barrier
; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[12:13], v[0:1]
; GFX9-TGS-NEXT: buffer_inv sc0
; GFX9-TGS-NEXT: v_pk_add_f32 v[0:1], v[10:11], v[0:1]
; GFX9-TGS-NEXT: s_nop 0
; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[6:7], v[0:1]
; GFX9-TGS-NEXT: s_nop 0
; GFX9-TGS-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-TGS-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-TGS-NEXT: global_store_short v[2:3], v0, off
; GFX9-TGS-NEXT: global_store_short v[4:5], v1, off
; The two fences are the only IR-level bodies needed: they register the
; "workgroup" and "wavefront" syncscopes so the numeric syncscope operands
; used by ATOMIC_FENCE in the MIR body decode to the intended scopes.
fence syncscope("workgroup") acq_rel
fence syncscope("wavefront") acq_rel
ret void
}
...
---
# MIR under test: four 16-bit global loads feeding a chain of packed-FP math,
# followed by a release-fence / s_barrier / acquire-fence sequence and two
# stores. The machine scheduler (see RUN lines) decides where the fence and
# barrier land relative to the load waits; the CHECK lines above pin that.
name: test_workgroup
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
%0:sgpr_256 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vreg_96_align2 = IMPLICIT_DEF
%3:vgpr_32 = IMPLICIT_DEF
%4:vreg_64_align2 = IMPLICIT_DEF
%5:vreg_64_align2 = IMPLICIT_DEF
%6:vreg_64_align2 = IMPLICIT_DEF
%7:sgpr_64 = IMPLICIT_DEF
%8:vreg_64_align2 = IMPLICIT_DEF
%9:vreg_64_align2 = IMPLICIT_DEF
%10:vreg_64_align2 = IMPLICIT_DEF
%11:vreg_64_align2 = IMPLICIT_DEF
%12:vgpr_32 = IMPLICIT_DEF
; First two high-latency loads; their addresses are ready immediately.
%13:vgpr_32 = GLOBAL_LOAD_USHORT %5:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
%14:vreg_64_align2 = IMPLICIT_DEF
%15:vgpr_32 = GLOBAL_LOAD_USHORT %14:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
; Integer address computation for the second pair of loads.
%16:vgpr_32 = V_ADD_U32_e32 1, %12:vgpr_32, implicit $exec
%17:vgpr_32 = V_MUL_HI_U32_e64 %16:vgpr_32, %3:vgpr_32, implicit $exec
%18:vgpr_32 = V_SUB_U32_e32 %16:vgpr_32, %17:vgpr_32, implicit $exec
%19:vgpr_32 = V_LSHRREV_B32_e32 1, %18:vgpr_32, implicit $exec
%20:vgpr_32 = V_ADD_U32_e32 %19:vgpr_32, %17:vgpr_32, implicit $exec
%21:vgpr_32 = V_LSHRREV_B32_e32 14, %20:vgpr_32, implicit $exec
%22:vgpr_32 = V_MUL_U32_U24_e32 1361367, %21:vgpr_32, implicit $exec
%6.sub0:vreg_64_align2 = V_ADD3_U32_e64 %1:vgpr_32, %22:vgpr_32, %16:vgpr_32, implicit $exec
%23:vreg_64_align2 = nuw nsw V_LSHLREV_B64_e64 1, %6:vreg_64_align2, implicit $exec
%24:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub2_sub3:sgpr_256, 0, %23:vreg_64_align2, implicit $exec
%25:vgpr_32 = GLOBAL_LOAD_USHORT %24:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
%26:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub0_sub1:sgpr_256, 0, %23:vreg_64_align2, implicit $exec
%27:vgpr_32 = GLOBAL_LOAD_USHORT %26:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
; Convert the four loaded f16 values and combine with packed-f32 math;
; these are the consumers whose waits the barrier is meant to overlap.
undef %28.sub0:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %13:vgpr_32, implicit $mode, implicit $exec
%28.sub1:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %25:vgpr_32, implicit $mode, implicit $exec
undef %29.sub0:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %15:vgpr_32, implicit $mode, implicit $exec
%29.sub1:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %27:vgpr_32, implicit $mode, implicit $exec
%30:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %29:vreg_64_align2, 11, %2.sub0_sub1:vreg_96_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
%31:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, %28:vreg_64_align2, 0, %7:sgpr_64, 8, %10:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
%32:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %9:vreg_64_align2, 8, %30:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
%33:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %11:vreg_64_align2, 8, %32:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
%34:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %31:vreg_64_align2, 8, %33:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
%35:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %8:vreg_64_align2, 8, %34:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; ATOMIC_FENCE <ordering>, <syncscope>: syncscope 2 = workgroup (see file
; header). Orderings 5/4 appear to be release/acquire in LLVM's
; AtomicOrdering encoding — confirm against AtomicOrdering.h. Together with
; S_BARRIER they form the workgroup barrier this test schedules around.
ATOMIC_FENCE 5, 2
S_BARRIER
ATOMIC_FENCE 4, 2
; Convert results back to f16 and store to the two derived addresses.
%36:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %35.sub0:vreg_64_align2, implicit $mode, implicit $exec
%37:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub4_sub5:sgpr_256, 0, %4:vreg_64_align2, implicit $exec
GLOBAL_STORE_SHORT %37:vreg_64_align2, %36:vgpr_32, 0, 0, implicit $exec :: (store (s16), addrspace 1)
%38:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %35.sub1:vreg_64_align2, implicit $mode, implicit $exec
%39:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub4_sub5:sgpr_256, 0, %23:vreg_64_align2, implicit $exec
GLOBAL_STORE_SHORT %39:vreg_64_align2, %38:vgpr_32, 0, 0, implicit $exec :: (store (s16), addrspace 1)
...