; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "load" --filter-out "store" --version 6
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GFX6,GFX6-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GFX6,GFX6-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10,GFX10-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX11-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX11-GISEL %s
; Minimum offset
; Calls llvm.amdgcn.ds.gws.init with a constant 0 as its second (offset)
; operand. The tahiti checks expect m0 to be set to 0 and the ds_gws_init to sit
; inside a loop that writes 0 to hwreg(HW_REG_TRAPSTS, 8, 1), issues the
; instruction, reads the same hwreg field back and branches to the loop header
; while it is non-zero. The gfx900/gfx1010/gfx1100 checks expect a single
; ds_gws_init with no loop and no offset modifier.
define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
; GFX6-LABEL: gws_init_offset0:
; GFX6: ; %bb.0:
; GFX6: s_mov_b32 m0, 0
; GFX6: s_waitcnt lgkmcnt(0)
; GFX6: v_mov_b32_e32 v0, s0
; GFX6: .LBB0_1: ; =>This Inner Loop Header: Depth=1
; GFX6: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6: ds_gws_init v0 gds
; GFX6: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6: s_cmp_lg_u32 s0, 0
; GFX6: s_cbranch_scc1 .LBB0_1
; GFX6: ; %bb.2:
; GFX6: s_endpgm
;
; GCN-LABEL: gws_init_offset0:
; GCN: ; %bb.0:
; GCN: s_mov_b32 m0, 0
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_mov_b32_e32 v0, s0
; GCN: ds_gws_init v0 gds
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN: s_endpgm
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
ret void
}
; Maximum offset
; Same shape as the offset-0 test but with the constant 63 as the offset
; operand. Every configuration is expected to keep m0 at 0 and carry the
; constant in the instruction's offset:63 modifier; tahiti again expects the
; hwreg(HW_REG_TRAPSTS, 8, 1) retry loop around the ds_gws_init.
define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
; GFX6-LABEL: gws_init_offset63:
; GFX6: ; %bb.0:
; GFX6: s_mov_b32 m0, 0
; GFX6: s_waitcnt lgkmcnt(0)
; GFX6: v_mov_b32_e32 v0, s0
; GFX6: .LBB1_1: ; =>This Inner Loop Header: Depth=1
; GFX6: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6: ds_gws_init v0 offset:63 gds
; GFX6: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6: s_cmp_lg_u32 s0, 0
; GFX6: s_cbranch_scc1 .LBB1_1
; GFX6: ; %bb.2:
; GFX6: s_endpgm
;
; GCN-LABEL: gws_init_offset63:
; GCN: ; %bb.0:
; GCN: s_mov_b32 m0, 0
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_mov_b32_e32 v0, s0
; GCN: ds_gws_init v0 offset:63 gds
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN: s_endpgm
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
ret void
}
; Variable offset in SGPR
; The offset operand is the second kernel argument. All configurations are
; expected to shift it left by 16 into m0 (s_lshl_b32 m0, s1, 16) and emit the
; ds_gws_init with no offset modifier. On gfx900 the SelectionDAG and GlobalISel
; runs differ only in scheduling: GlobalISel writes m0 immediately before the
; ds_gws_init and has an s_nop 0 between them.
; FIXME: Should be able to shift directly into m0
; NOTE(review): the checks below already show the shift writing m0 directly, so
; the FIXME above may be stale - confirm before removing it.
define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
; GFX6-LABEL: gws_init_sgpr_offset:
; GFX6: ; %bb.0:
; GFX6: s_waitcnt lgkmcnt(0)
; GFX6: v_mov_b32_e32 v0, s0
; GFX6: s_lshl_b32 m0, s1, 16
; GFX6: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX6: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6: ds_gws_init v0 gds
; GFX6: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6: s_cmp_lg_u32 s0, 0
; GFX6: s_cbranch_scc1 .LBB2_1
; GFX6: ; %bb.2:
; GFX6: s_endpgm
;
; GFX9-SDAG-LABEL: gws_init_sgpr_offset:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG: s_waitcnt lgkmcnt(0)
; GFX9-SDAG: s_lshl_b32 m0, s1, 16
; GFX9-SDAG: v_mov_b32_e32 v0, s0
; GFX9-SDAG: ds_gws_init v0 gds
; GFX9-SDAG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG: s_endpgm
;
; GFX9-GISEL-LABEL: gws_init_sgpr_offset:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL: s_waitcnt lgkmcnt(0)
; GFX9-GISEL: v_mov_b32_e32 v0, s0
; GFX9-GISEL: s_lshl_b32 m0, s1, 16
; GFX9-GISEL: s_nop 0
; GFX9-GISEL: ds_gws_init v0 gds
; GFX9-GISEL: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL: s_endpgm
;
; GFX1011-LABEL: gws_init_sgpr_offset:
; GFX1011: ; %bb.0:
; GFX1011: s_waitcnt lgkmcnt(0)
; GFX1011: v_mov_b32_e32 v0, s0
; GFX1011: s_lshl_b32 m0, s1, 16
; GFX1011: ds_gws_init v0 gds
; GFX1011: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1011: s_endpgm
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
ret void
}
; Variable offset in SGPR with constant add
; The offset operand is (%offset.base + 1). In every configuration the checks
; expect the base to be shifted into m0 unchanged (s_lshl_b32 m0, s1, 16) and
; the added constant to appear as the instruction's offset:1 modifier, i.e. no
; separate add instruction is expected.
define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
; GFX6-LABEL: gws_init_sgpr_offset_add1:
; GFX6: ; %bb.0:
; GFX6: s_waitcnt lgkmcnt(0)
; GFX6: v_mov_b32_e32 v0, s0
; GFX6: s_lshl_b32 m0, s1, 16
; GFX6: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX6: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6: ds_gws_init v0 offset:1 gds
; GFX6: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6: s_cmp_lg_u32 s0, 0
; GFX6: s_cbranch_scc1 .LBB3_1
; GFX6: ; %bb.2:
; GFX6: s_endpgm
;
; GFX9-SDAG-LABEL: gws_init_sgpr_offset_add1:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG: s_waitcnt lgkmcnt(0)
; GFX9-SDAG: s_lshl_b32 m0, s1, 16
; GFX9-SDAG: v_mov_b32_e32 v0, s0
; GFX9-SDAG: ds_gws_init v0 offset:1 gds
; GFX9-SDAG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG: s_endpgm
;
; GFX9-GISEL-LABEL: gws_init_sgpr_offset_add1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL: s_waitcnt lgkmcnt(0)
; GFX9-GISEL: v_mov_b32_e32 v0, s0
; GFX9-GISEL: s_lshl_b32 m0, s1, 16
; GFX9-GISEL: s_nop 0
; GFX9-GISEL: ds_gws_init v0 offset:1 gds
; GFX9-GISEL: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL: s_endpgm
;
; GFX1011-LABEL: gws_init_sgpr_offset_add1:
; GFX1011: ; %bb.0:
; GFX1011: s_waitcnt lgkmcnt(0)
; GFX1011: v_mov_b32_e32 v0, s0
; GFX1011: s_lshl_b32 m0, s1, 16
; GFX1011: ds_gws_init v0 offset:1 gds
; GFX1011: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1011: s_endpgm
%offset = add i32 %offset.base, 1
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
ret void
}
; Variable offset in VGPR
; The offset operand is the result of llvm.amdgcn.workitem.id.x. The checks
; expect the value to be copied out of v0 with v_readfirstlane_b32 and then
; shifted left by 16 into m0. The gfx1100 checks additionally expect v0 to be
; masked with 0x3ff before the readfirstlane.
define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
; GFX6-LABEL: gws_init_vgpr_offset:
; GFX6: ; %bb.0:
; GFX6: v_readfirstlane_b32 s1, v0
; GFX6: s_lshl_b32 m0, s1, 16
; GFX6: s_waitcnt lgkmcnt(0)
; GFX6: v_mov_b32_e32 v0, s0
; GFX6: .LBB4_1: ; =>This Inner Loop Header: Depth=1
; GFX6: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6: ds_gws_init v0 gds
; GFX6: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6: s_cmp_lg_u32 s0, 0
; GFX6: s_cbranch_scc1 .LBB4_1
; GFX6: ; %bb.2:
; GFX6: s_endpgm
;
; GFX9-LABEL: gws_init_vgpr_offset:
; GFX9: ; %bb.0:
; GFX9: v_readfirstlane_b32 s1, v0
; GFX9: s_lshl_b32 m0, s1, 16
; GFX9: s_waitcnt lgkmcnt(0)
; GFX9: v_mov_b32_e32 v0, s0
; GFX9: ds_gws_init v0 gds
; GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9: s_endpgm
;
; GFX10-LABEL: gws_init_vgpr_offset:
; GFX10: ; %bb.0:
; GFX10: v_readfirstlane_b32 s1, v0
; GFX10: s_lshl_b32 m0, s1, 16
; GFX10: s_waitcnt lgkmcnt(0)
; GFX10: v_mov_b32_e32 v0, s0
; GFX10: ds_gws_init v0 gds
; GFX10: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10: s_endpgm
;
; GFX11-LABEL: gws_init_vgpr_offset:
; GFX11: ; %bb.0:
; GFX11: v_and_b32_e32 v0, 0x3ff, v0
; GFX11: s_delay_alu instid0(VALU_DEP_1)
; GFX11: v_readfirstlane_b32 s1, v0
; GFX11: s_lshl_b32 m0, s1, 16
; GFX11: s_waitcnt lgkmcnt(0)
; GFX11: v_mov_b32_e32 v0, s0
; GFX11: ds_gws_init v0 gds
; GFX11: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11: s_endpgm
%vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
ret void
}
; Variable offset in VGPR with constant add
; The offset operand is (workitem.id.x + 3). The two instruction selectors are
; checked separately because their expected output differs: the SelectionDAG
; runs (-global-isel=0) expect the constant in the instruction's offset:3
; modifier with the unmodified id going through readfirstlane into m0, while the
; GlobalISel runs (-global-isel=1) expect a separate vector add of 3 before the
; readfirstlane and a ds_gws_init with no offset modifier.
define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
; GFX6-SDAG-LABEL: gws_init_vgpr_offset_add:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG: v_readfirstlane_b32 s1, v0
; GFX6-SDAG: s_lshl_b32 m0, s1, 16
; GFX6-SDAG: s_waitcnt lgkmcnt(0)
; GFX6-SDAG: v_mov_b32_e32 v0, s0
; GFX6-SDAG: .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX6-SDAG: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6-SDAG: ds_gws_init v0 offset:3 gds
; GFX6-SDAG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-SDAG: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6-SDAG: s_cmp_lg_u32 s0, 0
; GFX6-SDAG: s_cbranch_scc1 .LBB5_1
; GFX6-SDAG: ; %bb.2:
; GFX6-SDAG: s_endpgm
;
; GFX6-GISEL-LABEL: gws_init_vgpr_offset_add:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL: v_add_i32_e32 v0, vcc, 3, v0
; GFX6-GISEL: v_readfirstlane_b32 s1, v0
; GFX6-GISEL: s_lshl_b32 m0, s1, 16
; GFX6-GISEL: s_waitcnt lgkmcnt(0)
; GFX6-GISEL: v_mov_b32_e32 v0, s0
; GFX6-GISEL: .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX6-GISEL: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6-GISEL: ds_gws_init v0 gds
; GFX6-GISEL: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-GISEL: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6-GISEL: s_cmp_lg_u32 s0, 0
; GFX6-GISEL: s_cbranch_scc1 .LBB5_1
; GFX6-GISEL: ; %bb.2:
; GFX6-GISEL: s_endpgm
;
; GFX9-SDAG-LABEL: gws_init_vgpr_offset_add:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG: v_readfirstlane_b32 s1, v0
; GFX9-SDAG: s_lshl_b32 m0, s1, 16
; GFX9-SDAG: s_waitcnt lgkmcnt(0)
; GFX9-SDAG: v_mov_b32_e32 v0, s0
; GFX9-SDAG: ds_gws_init v0 offset:3 gds
; GFX9-SDAG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG: s_endpgm
;
; GFX9-GISEL-LABEL: gws_init_vgpr_offset_add:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL: v_add_u32_e32 v0, 3, v0
; GFX9-GISEL: v_readfirstlane_b32 s1, v0
; GFX9-GISEL: s_lshl_b32 m0, s1, 16
; GFX9-GISEL: s_waitcnt lgkmcnt(0)
; GFX9-GISEL: v_mov_b32_e32 v0, s0
; GFX9-GISEL: ds_gws_init v0 gds
; GFX9-GISEL: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL: s_endpgm
;
; GFX10-SDAG-LABEL: gws_init_vgpr_offset_add:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG: v_readfirstlane_b32 s1, v0
; GFX10-SDAG: s_lshl_b32 m0, s1, 16
; GFX10-SDAG: s_waitcnt lgkmcnt(0)
; GFX10-SDAG: v_mov_b32_e32 v0, s0
; GFX10-SDAG: ds_gws_init v0 offset:3 gds
; GFX10-SDAG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG: s_endpgm
;
; GFX10-GISEL-LABEL: gws_init_vgpr_offset_add:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL: v_add_nc_u32_e32 v0, 3, v0
; GFX10-GISEL: v_readfirstlane_b32 s1, v0
; GFX10-GISEL: s_lshl_b32 m0, s1, 16
; GFX10-GISEL: s_waitcnt lgkmcnt(0)
; GFX10-GISEL: v_mov_b32_e32 v0, s0
; GFX10-GISEL: ds_gws_init v0 gds
; GFX10-GISEL: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL: s_endpgm
;
; GFX11-SDAG-LABEL: gws_init_vgpr_offset_add:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-SDAG: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG: v_readfirstlane_b32 s1, v0
; GFX11-SDAG: s_lshl_b32 m0, s1, 16
; GFX11-SDAG: s_waitcnt lgkmcnt(0)
; GFX11-SDAG: v_mov_b32_e32 v0, s0
; GFX11-SDAG: ds_gws_init v0 offset:3 gds
; GFX11-SDAG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG: s_endpgm
;
; GFX11-GISEL-LABEL: gws_init_vgpr_offset_add:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GISEL: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL: v_add_nc_u32_e32 v0, 3, v0
; GFX11-GISEL: v_readfirstlane_b32 s1, v0
; GFX11-GISEL: s_lshl_b32 m0, s1, 16
; GFX11-GISEL: s_waitcnt lgkmcnt(0)
; GFX11-GISEL: v_mov_b32_e32 v0, s0
; GFX11-GISEL: ds_gws_init v0 gds
; GFX11-GISEL: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL: s_endpgm
%vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
%vgpr.offset = add i32 %vgpr.offset.base, 3
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
ret void
}
; i32 global in addrspace(3) with a poison initializer; it is the target of the
; two stores in @gws_init_save_m0_init_constant_offset.
@lds = internal unnamed_addr addrspace(3) global i32 poison
; Check if m0 initialization is shared.
; Places a constant-offset (10) gws.init between a volatile store and a plain
; store to @lds. The tahiti checks expect m0 to be written three times: -1
; before the first ds_write_b32, 0 before the ds_gws_init loop, and -1 again
; before the second ds_write_b32. The gfx900/gfx1010/gfx1100 checks expect a
; single m0 write of 0 for the ds_gws_init.
; NOTE(review): the gfx1100 checks contain no LDS write lines at all; this is
; presumably because that target's mnemonic matches the --filter-out "store"
; argument on the autogenerated-assertions line - confirm.
define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
; GFX6-SDAG-LABEL: gws_init_save_m0_init_constant_offset:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG: v_mov_b32_e32 v1, 1
; GFX6-SDAG: v_mov_b32_e32 v0, 0
; GFX6-SDAG: s_mov_b32 m0, -1
; GFX6-SDAG: ds_write_b32 v0, v1
; GFX6-SDAG: s_waitcnt lgkmcnt(0)
; GFX6-SDAG: v_mov_b32_e32 v1, s0
; GFX6-SDAG: s_mov_b32 m0, 0
; GFX6-SDAG: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GFX6-SDAG: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6-SDAG: ds_gws_init v1 offset:10 gds
; GFX6-SDAG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-SDAG: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6-SDAG: s_cmp_lg_u32 s0, 0
; GFX6-SDAG: s_cbranch_scc1 .LBB6_1
; GFX6-SDAG: ; %bb.2:
; GFX6-SDAG: v_mov_b32_e32 v1, 2
; GFX6-SDAG: s_mov_b32 m0, -1
; GFX6-SDAG: ds_write_b32 v0, v1
; GFX6-SDAG: s_endpgm
;
; GFX6-GISEL-LABEL: gws_init_save_m0_init_constant_offset:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL: v_mov_b32_e32 v0, 1
; GFX6-GISEL: v_mov_b32_e32 v1, 0
; GFX6-GISEL: s_mov_b32 m0, -1
; GFX6-GISEL: ds_write_b32 v1, v0
; GFX6-GISEL: s_waitcnt lgkmcnt(0)
; GFX6-GISEL: v_mov_b32_e32 v0, s0
; GFX6-GISEL: s_mov_b32 m0, 0
; GFX6-GISEL: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GFX6-GISEL: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6-GISEL: ds_gws_init v0 offset:10 gds
; GFX6-GISEL: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-GISEL: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6-GISEL: s_cmp_lg_u32 s0, 0
; GFX6-GISEL: s_cbranch_scc1 .LBB6_1
; GFX6-GISEL: ; %bb.2:
; GFX6-GISEL: v_mov_b32_e32 v0, 2
; GFX6-GISEL: v_mov_b32_e32 v1, 0
; GFX6-GISEL: s_mov_b32 m0, -1
; GFX6-GISEL: ds_write_b32 v1, v0
; GFX6-GISEL: s_endpgm
;
; GFX9-LABEL: gws_init_save_m0_init_constant_offset:
; GFX9: ; %bb.0:
; GFX9: v_mov_b32_e32 v0, 1
; GFX9: v_mov_b32_e32 v1, 0
; GFX9: ds_write_b32 v1, v0
; GFX9: s_mov_b32 m0, 0
; GFX9: s_waitcnt lgkmcnt(0)
; GFX9: v_mov_b32_e32 v0, s0
; GFX9: ds_gws_init v0 offset:10 gds
; GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9: v_mov_b32_e32 v0, 2
; GFX9: ds_write_b32 v1, v0
; GFX9: s_endpgm
;
; GFX10-LABEL: gws_init_save_m0_init_constant_offset:
; GFX10: ; %bb.0:
; GFX10: v_mov_b32_e32 v0, 1
; GFX10: v_mov_b32_e32 v1, 0
; GFX10: v_mov_b32_e32 v3, 2
; GFX10: s_mov_b32 m0, 0
; GFX10: ds_write_b32 v1, v0
; GFX10: s_waitcnt lgkmcnt(0)
; GFX10: v_mov_b32_e32 v2, s0
; GFX10: ds_gws_init v2 offset:10 gds
; GFX10: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10: ds_write_b32 v1, v3
; GFX10: s_endpgm
;
; GFX11-LABEL: gws_init_save_m0_init_constant_offset:
; GFX11: ; %bb.0:
; GFX11: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 0
; GFX11: s_mov_b32 m0, 0
; GFX11: v_mov_b32_e32 v3, 2
; GFX11: s_waitcnt lgkmcnt(0)
; GFX11: v_mov_b32_e32 v2, s0
; GFX11: ds_gws_init v2 offset:10 gds
; GFX11: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11: s_endpgm
store volatile i32 1, ptr addrspace(3) @lds
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
store i32 2, ptr addrspace(3) @lds
ret void
}
; gws.init with constant offset 0 in a plain (non-kernel) function: unlike the
; other tests this one is not an amdgpu_kernel and carries no attribute group,
; so %val is expected directly in v0 and the function ends in
; s_setpc_b64 s[30:31] rather than s_endpgm. The checks expect a full
; s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) after the ds_gws_init before
; returning, and on gfx900 an s_nop 0 between the m0 write and the ds_gws_init.
define void @gws_init_lgkmcnt(i32 %val) {
; GFX6-LABEL: gws_init_lgkmcnt:
; GFX6: ; %bb.0:
; GFX6: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6: s_mov_b32 m0, 0
; GFX6: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX6: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6: ds_gws_init v0 gds
; GFX6: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6: s_getreg_b32 s4, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6: s_cmp_lg_u32 s4, 0
; GFX6: s_cbranch_scc1 .LBB7_1
; GFX6: ; %bb.2:
; GFX6: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: gws_init_lgkmcnt:
; GFX9: ; %bb.0:
; GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9: s_mov_b32 m0, 0
; GFX9: s_nop 0
; GFX9: ds_gws_init v0 gds
; GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9: s_setpc_b64 s[30:31]
;
; GFX1011-LABEL: gws_init_lgkmcnt:
; GFX1011: ; %bb.0:
; GFX1011: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1011: s_mov_b32 m0, 0
; GFX1011: ds_gws_init v0 gds
; GFX1011: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1011: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
ret void
}
; Does not imply memory fence on its own
; Stores 0 through the global pointer argument and then calls gws.init with
; constant offset 7. Before the ds_gws_init the checks expect only
; s_waitcnt lgkmcnt(0) (plus s_waitcnt expcnt(0) on tahiti, ahead of the
; v_mov_b32 that overwrites v0); no vmcnt wait is expected between the store
; and the ds_gws_init.
; NOTE(review): the global store instruction itself does not appear in the
; checks, presumably because of the --filter-out "store" argument on the
; autogenerated-assertions line - confirm.
define amdgpu_kernel void @gws_init_wait_before(i32 %val, ptr addrspace(1) %ptr) #0 {
; GFX6-SDAG-LABEL: gws_init_wait_before:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG: s_mov_b32 s3, 0x100f000
; GFX6-SDAG: s_mov_b32 s2, -1
; GFX6-SDAG: v_mov_b32_e32 v0, 0
; GFX6-SDAG: s_waitcnt lgkmcnt(0)
; GFX6-SDAG: s_waitcnt expcnt(0)
; GFX6-SDAG: v_mov_b32_e32 v0, s4
; GFX6-SDAG: s_mov_b32 m0, 0
; GFX6-SDAG: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX6-SDAG: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6-SDAG: ds_gws_init v0 offset:7 gds
; GFX6-SDAG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-SDAG: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6-SDAG: s_cmp_lg_u32 s0, 0
; GFX6-SDAG: s_cbranch_scc1 .LBB8_1
; GFX6-SDAG: ; %bb.2:
; GFX6-SDAG: s_endpgm
;
; GFX6-GISEL-LABEL: gws_init_wait_before:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL: v_mov_b32_e32 v0, 0
; GFX6-GISEL: s_mov_b32 s2, -1
; GFX6-GISEL: s_mov_b32 s3, 0x100f000
; GFX6-GISEL: s_waitcnt lgkmcnt(0)
; GFX6-GISEL: s_waitcnt expcnt(0)
; GFX6-GISEL: v_mov_b32_e32 v0, s4
; GFX6-GISEL: s_mov_b32 m0, 0
; GFX6-GISEL: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX6-GISEL: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
; GFX6-GISEL: ds_gws_init v0 offset:7 gds
; GFX6-GISEL: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-GISEL: s_getreg_b32 s0, hwreg(HW_REG_TRAPSTS, 8, 1)
; GFX6-GISEL: s_cmp_lg_u32 s0, 0
; GFX6-GISEL: s_cbranch_scc1 .LBB8_1
; GFX6-GISEL: ; %bb.2:
; GFX6-GISEL: s_endpgm
;
; GFX9-LABEL: gws_init_wait_before:
; GFX9: ; %bb.0:
; GFX9: v_mov_b32_e32 v0, 0
; GFX9: s_mov_b32 m0, 0
; GFX9: s_waitcnt lgkmcnt(0)
; GFX9: v_mov_b32_e32 v0, s2
; GFX9: ds_gws_init v0 offset:7 gds
; GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9: s_endpgm
;
; GFX1011-LABEL: gws_init_wait_before:
; GFX1011: ; %bb.0:
; GFX1011: s_clause 0x1
; GFX1011: v_mov_b32_e32 v0, 0
; GFX1011: s_mov_b32 m0, 0
; GFX1011: s_waitcnt lgkmcnt(0)
; GFX1011: v_mov_b32_e32 v1, s2
; GFX1011: ds_gws_init v1 offset:7 gds
; GFX1011: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1011: s_endpgm
store i32 0, ptr addrspace(1) %ptr
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
ret void
}
; Intrinsic under test: takes two i32 operands (the tests above pass %val first
; and the offset second) and returns nothing.
declare void @llvm.amdgcn.ds.gws.init(i32, i32) #1
; Source of the per-workitem value used by the VGPR-offset tests.
declare i32 @llvm.amdgcn.workitem.id.x() #2
; #0 is applied to every amdgpu_kernel test function in this file.
attributes #0 = { nounwind }
; #1 is the attribute set on the gws.init declaration.
attributes #1 = { convergent inaccessiblememonly nounwind writeonly }
; #2 is the attribute set on the workitem.id.x declaration.
attributes #2 = { nounwind readnone speculatable }