| # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| # RUN: llc -mtriple=amdgcn -mcpu=gfx950 -start-before=machine-scheduler -o - %s | FileCheck -check-prefix=GFX9 %s |
| # RUN: llc -mtriple=amdgcn -mcpu=gfx950 -mattr=+tgsplit -start-before=machine-scheduler -o - %s | FileCheck -check-prefix=GFX9-TGS %s |
| |
# Check that workgroup fences on GFX9 do not add scheduling latency:
# s_barrier should occur before the s_waitcnt instructions so that the
# barrier itself hides the load latency. This allows the pre-barrier atomic
# fence to be merged with the waits on the loads. When tg-split is enabled,
# the latency should still be added.
| |
# The LLVM IR module below is included so that the syncscope IDs referenced
# from the MIR resolve correctly:
#   SSID 2 = workgroup
#   SSID 3 = wavefront
--- |
  ; Companion IR for the MIR body below; kept so the syncscope IDs used by
  ; the ATOMIC_FENCE instructions resolve against this module. The CHECK
  ; lines are autogenerated (update_llc_test_checks.py) -- do not edit by
  ; hand.
  define amdgpu_kernel void @test_workgroup() {
  ; GFX9-LABEL: test_workgroup:
  ; GFX9: ; %bb.0:
  ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
  ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
  ; GFX9-NEXT: global_load_ushort v14, v[0:1], off
  ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
  ; GFX9-NEXT: global_load_ushort v15, v[4:5], off
  ; GFX9-NEXT: ; implicit-def: $vgpr2
  ; GFX9-NEXT: v_add_u32_e32 v0, 1, v2
  ; GFX9-NEXT: ; implicit-def: $vgpr7
  ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v7
  ; GFX9-NEXT: v_sub_u32_e32 v2, v0, v1
  ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2
  ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
  ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 14, v1
  ; GFX9-NEXT: v_mul_u32_u24_e32 v1, 0x14c5d7, v1
  ; GFX9-NEXT: ; implicit-def: $vgpr6
  ; GFX9-NEXT: v_add3_u32 v2, v6, v1, v0
  ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3]
  ; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
  ; GFX9-NEXT: v_lshl_add_u64 v[2:3], s[0:1], 0, v[4:5]
  ; GFX9-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[4:5]
  ; GFX9-NEXT: global_load_ushort v17, v[2:3], off
  ; GFX9-NEXT: global_load_ushort v18, v[0:1], off
  ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
  ; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9
  ; GFX9-NEXT: ; implicit-def: $vgpr10_vgpr11
  ; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
  ; GFX9-NEXT: ; implicit-def: $vgpr12_vgpr13
  ; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7
  ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
  ; GFX9-NEXT: v_lshl_add_u64 v[4:5], s[4:5], 0, v[4:5]
  ; GFX9-NEXT: v_lshl_add_u64 v[2:3], s[4:5], 0, v[2:3]
  ; GFX9-NEXT: s_barrier
  ; GFX9-NEXT: s_waitcnt vmcnt(3)
  ; GFX9-NEXT: v_cvt_f32_f16_e32 v14, v14
  ; GFX9-NEXT: s_waitcnt vmcnt(2)
  ; GFX9-NEXT: v_cvt_f32_f16_e32 v16, v15
  ; GFX9-NEXT: s_waitcnt vmcnt(1)
  ; GFX9-NEXT: v_cvt_f32_f16_e32 v17, v17
  ; GFX9-NEXT: s_waitcnt vmcnt(0)
  ; GFX9-NEXT: v_cvt_f32_f16_e32 v15, v18
  ; GFX9-NEXT: v_pk_add_f32 v[0:1], v[16:17], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
  ; GFX9-NEXT: v_pk_fma_f32 v[10:11], v[14:15], s[0:1], v[10:11] op_sel_hi:[1,0,1]
  ; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[8:9], v[0:1]
  ; GFX9-NEXT: s_nop 0
  ; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[12:13], v[0:1]
  ; GFX9-NEXT: s_nop 0
  ; GFX9-NEXT: v_pk_add_f32 v[0:1], v[10:11], v[0:1]
  ; GFX9-NEXT: s_nop 0
  ; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[6:7], v[0:1]
  ; GFX9-NEXT: s_nop 0
  ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
  ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
  ; GFX9-NEXT: global_store_short v[2:3], v0, off
  ; GFX9-NEXT: global_store_short v[4:5], v1, off
  ;
  ; GFX9-TGS-LABEL: test_workgroup:
  ; GFX9-TGS: ; %bb.0:
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr0_vgpr1
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr2_vgpr3
  ; GFX9-TGS-NEXT: global_load_ushort v14, v[0:1], off
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr4_vgpr5
  ; GFX9-TGS-NEXT: global_load_ushort v15, v[4:5], off
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr2
  ; GFX9-TGS-NEXT: v_add_u32_e32 v0, 1, v2
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr7
  ; GFX9-TGS-NEXT: v_mul_hi_u32 v1, v0, v7
  ; GFX9-TGS-NEXT: v_sub_u32_e32 v2, v0, v1
  ; GFX9-TGS-NEXT: v_lshrrev_b32_e32 v2, 1, v2
  ; GFX9-TGS-NEXT: v_add_u32_e32 v1, v2, v1
  ; GFX9-TGS-NEXT: v_lshrrev_b32_e32 v1, 14, v1
  ; GFX9-TGS-NEXT: v_mul_u32_u24_e32 v1, 0x14c5d7, v1
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr6
  ; GFX9-TGS-NEXT: v_add3_u32 v2, v6, v1, v0
  ; GFX9-TGS-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3]
  ; GFX9-TGS-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
  ; GFX9-TGS-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[4:5]
  ; GFX9-TGS-NEXT: global_load_ushort v18, v[0:1], off
  ; GFX9-TGS-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[4:5]
  ; GFX9-TGS-NEXT: global_load_ushort v17, v[0:1], off
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr8_vgpr9
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr10_vgpr11
  ; GFX9-TGS-NEXT: ; implicit-def: $sgpr0_sgpr1
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr12_vgpr13
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr6_vgpr7
  ; GFX9-TGS-NEXT: ; implicit-def: $vgpr2_vgpr3
  ; GFX9-TGS-NEXT: v_lshl_add_u64 v[4:5], s[4:5], 0, v[4:5]
  ; GFX9-TGS-NEXT: v_lshl_add_u64 v[2:3], s[4:5], 0, v[2:3]
  ; GFX9-TGS-NEXT: s_waitcnt vmcnt(3)
  ; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v14, v14
  ; GFX9-TGS-NEXT: s_waitcnt vmcnt(2)
  ; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v16, v15
  ; GFX9-TGS-NEXT: s_waitcnt vmcnt(1)
  ; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v15, v18
  ; GFX9-TGS-NEXT: s_waitcnt vmcnt(0)
  ; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v17, v17
  ; GFX9-TGS-NEXT: v_pk_fma_f32 v[10:11], v[14:15], s[0:1], v[10:11] op_sel_hi:[1,0,1]
  ; GFX9-TGS-NEXT: v_pk_add_f32 v[0:1], v[16:17], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
  ; GFX9-TGS-NEXT: s_nop 0
  ; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[8:9], v[0:1]
  ; GFX9-TGS-NEXT: s_barrier
  ; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[12:13], v[0:1]
  ; GFX9-TGS-NEXT: buffer_inv sc0
  ; GFX9-TGS-NEXT: v_pk_add_f32 v[0:1], v[10:11], v[0:1]
  ; GFX9-TGS-NEXT: s_nop 0
  ; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[6:7], v[0:1]
  ; GFX9-TGS-NEXT: s_nop 0
  ; GFX9-TGS-NEXT: v_cvt_f16_f32_e32 v0, v0
  ; GFX9-TGS-NEXT: v_cvt_f16_f32_e32 v1, v1
  ; GFX9-TGS-NEXT: global_store_short v[2:3], v0, off
  ; GFX9-TGS-NEXT: global_store_short v[4:5], v1, off
    ; The fence pair below mirrors the ATOMIC_FENCE / S_BARRIER / ATOMIC_FENCE
    ; sequence in the MIR body; the test actually compiles the MIR, this IR
    ; only supplies the module and syncscope IDs.
    fence syncscope("workgroup") acq_rel
    fence syncscope("wavefront") acq_rel
    ret void
  }
...
| |
---
# MIR input for @test_workgroup; fed to llc with -start-before=machine-scheduler
# (see RUN lines), so instruction order here is the scheduler's input.
name: test_workgroup
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
body: |
  bb.0:
    ; All inputs are IMPLICIT_DEF: only the relative scheduling of the loads,
    ; ALU chain, fences and barrier is under test, not the values.
    %0:sgpr_256 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vreg_96_align2 = IMPLICIT_DEF
    %3:vgpr_32 = IMPLICIT_DEF
    %4:vreg_64_align2 = IMPLICIT_DEF
    %5:vreg_64_align2 = IMPLICIT_DEF
    %6:vreg_64_align2 = IMPLICIT_DEF
    %7:sgpr_64 = IMPLICIT_DEF
    %8:vreg_64_align2 = IMPLICIT_DEF
    %9:vreg_64_align2 = IMPLICIT_DEF
    %10:vreg_64_align2 = IMPLICIT_DEF
    %11:vreg_64_align2 = IMPLICIT_DEF
    %12:vgpr_32 = IMPLICIT_DEF
    ; Four s16 global loads (%13, %15, %25, %27) whose latency the waits after
    ; the barrier must cover; "amdgpu-noclobber" marks them safe to reorder
    ; past the stores.
    %13:vgpr_32 = GLOBAL_LOAD_USHORT %5:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
    %14:vreg_64_align2 = IMPLICIT_DEF
    %15:vgpr_32 = GLOBAL_LOAD_USHORT %14:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
    ; Integer address computation feeding the remaining loads and the stores.
    %16:vgpr_32 = V_ADD_U32_e32 1, %12:vgpr_32, implicit $exec
    %17:vgpr_32 = V_MUL_HI_U32_e64 %16:vgpr_32, %3:vgpr_32, implicit $exec
    %18:vgpr_32 = V_SUB_U32_e32 %16:vgpr_32, %17:vgpr_32, implicit $exec
    %19:vgpr_32 = V_LSHRREV_B32_e32 1, %18:vgpr_32, implicit $exec
    %20:vgpr_32 = V_ADD_U32_e32 %19:vgpr_32, %17:vgpr_32, implicit $exec
    %21:vgpr_32 = V_LSHRREV_B32_e32 14, %20:vgpr_32, implicit $exec
    %22:vgpr_32 = V_MUL_U32_U24_e32 1361367, %21:vgpr_32, implicit $exec
    %6.sub0:vreg_64_align2 = V_ADD3_U32_e64 %1:vgpr_32, %22:vgpr_32, %16:vgpr_32, implicit $exec
    %23:vreg_64_align2 = nuw nsw V_LSHLREV_B64_e64 1, %6:vreg_64_align2, implicit $exec
    %24:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub2_sub3:sgpr_256, 0, %23:vreg_64_align2, implicit $exec
    %25:vgpr_32 = GLOBAL_LOAD_USHORT %24:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
    %26:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub0_sub1:sgpr_256, 0, %23:vreg_64_align2, implicit $exec
    %27:vgpr_32 = GLOBAL_LOAD_USHORT %26:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
    ; f16 -> f32 conversions consume the loaded values, forcing s_waitcnt
    ; insertion between the loads and this ALU chain.
    undef %28.sub0:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %13:vgpr_32, implicit $mode, implicit $exec
    %28.sub1:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %25:vgpr_32, implicit $mode, implicit $exec
    undef %29.sub0:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %15:vgpr_32, implicit $mode, implicit $exec
    %29.sub1:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %27:vgpr_32, implicit $mode, implicit $exec
    %30:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %29:vreg_64_align2, 11, %2.sub0_sub1:vreg_96_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %31:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, %28:vreg_64_align2, 0, %7:sgpr_64, 8, %10:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %32:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %9:vreg_64_align2, 8, %30:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %33:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %11:vreg_64_align2, 8, %32:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %34:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %31:vreg_64_align2, 8, %33:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %35:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %8:vreg_64_align2, 8, %34:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    ; The fence/barrier/fence triple under test (operands are <ordering>, <ssid>
    ; immediates). NOTE(review): 5/4 appear to be the release/acquire halves of
    ; the acq_rel IR fences and 2 the "workgroup" syncscope ID from the module
    ; above -- confirm against llvm::AtomicOrdering and the module's SSIDs.
    ATOMIC_FENCE 5, 2
    S_BARRIER
    ATOMIC_FENCE 4, 2
    ; Post-barrier results: convert back to f16 and store both halves.
    %36:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %35.sub0:vreg_64_align2, implicit $mode, implicit $exec
    %37:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub4_sub5:sgpr_256, 0, %4:vreg_64_align2, implicit $exec
    GLOBAL_STORE_SHORT %37:vreg_64_align2, %36:vgpr_32, 0, 0, implicit $exec :: (store (s16), addrspace 1)
    %38:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %35.sub1:vreg_64_align2, implicit $mode, implicit $exec
    %39:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub4_sub5:sgpr_256, 0, %23:vreg_64_align2, implicit $exec
    GLOBAL_STORE_SHORT %39:vreg_64_align2, %38:vgpr_32, 0, 0, implicit $exec :: (store (s16), addrspace 1)

...