| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s |
| ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s |
| ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s |
| ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s |
| ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s |
| |
| ; Test using saddr addressing mode of global_*load_* flat instructions. |
| |
| ; -------------------------------------------------------------------------------- |
| ; No vgpr offset, constants |
| ; -------------------------------------------------------------------------------- |
| |
| ; SGPR base only |
| ; Loads one byte straight from the inreg pointer %sbase (no added offset), zero-extends it to i32 and bitcasts the result to float for the return value. |
| ; The gfx900, gfx1010 and gfx1100 checks expect a saddr global byte load with a zero VGPR offset; the gfx1200 checks expect a scalar s_load_u8 followed by a copy into v0. |
| define amdgpu_ps float @global_load_saddr_i8_offset_0(ptr addrspace(1) inreg %sbase) { |
| ; GCN-LABEL: global_load_saddr_i8_offset_0: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mov_b32_e32 v0, 0 |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_0: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_0: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %load = load i8, ptr addrspace(1) %sbase |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum gfx9 immediate offset |
| ; Byte load at %sbase + 4095, zero-extended and returned as float bits. |
| ; The gfx900 and gfx1100 checks fold the whole offset into the instruction immediate; the gfx1010 checks split it into a 0x800 VGPR offset plus an immediate of 2047; the gfx1200 checks use s_load_u8 with immediate 0xfff. |
| define amdgpu_ps float @global_load_saddr_i8_offset_4095(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_4095: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_4095: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 |
| ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_4095: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_4095: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], 0xfff |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum gfx9 immediate offset + 1 |
| ; Byte load at %sbase + 4096, zero-extended and returned as float bits. |
| ; The gfx900, gfx1010 and gfx1100 checks all place 0x1000 in the VGPR offset and use no immediate; the gfx1200 checks use s_load_u8 with immediate 0x1000. |
| define amdgpu_ps float @global_load_saddr_i8_offset_4096(ptr addrspace(1) inreg %sbase) { |
| ; GCN-LABEL: global_load_saddr_i8_offset_4096: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mov_b32_e32 v0, 0x1000 |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_4096: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_4096: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x1000 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4096 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum gfx9 immediate offset + 2 |
| ; Byte load at %sbase + 4097, zero-extended and returned as float bits. |
| ; The gfx900, gfx1010 and gfx1100 checks split the offset into a 0x1000 VGPR offset plus an immediate of 1; the gfx1200 checks use s_load_u8 with immediate 0x1001. |
| define amdgpu_ps float @global_load_saddr_i8_offset_4097(ptr addrspace(1) inreg %sbase) { |
| ; GCN-LABEL: global_load_saddr_i8_offset_4097: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mov_b32_e32 v0, 0x1000 |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_4097: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_4097: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x1001 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4097 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum negative gfx9 immediate offset |
| ; Byte load at %sbase - 4096, zero-extended and returned as float bits. |
| ; The gfx900 and gfx1100 checks fold -4096 into the instruction immediate; the gfx1010 checks build the full 64-bit address in v[0:1] (low dword + 0xfffff000, high dword + -1 with carry) and load with "off". |
| ; Both gfx1200 check variants add the offset to the SGPR base with scalar ALU instructions and then use s_load_u8 with a zero immediate. |
| define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_neg4096: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_neg4096: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_neg4096: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg4096: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xf000 |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg4096: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfffff000 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum negative gfx9 immediate offset -1 |
| ; Byte load at %sbase - 4097, zero-extended and returned as float bits. |
| ; The gfx900 checks add the whole offset (0xffffefff, -1) to the SGPR base with scalar adds and load with a zero VGPR offset; the gfx1010 and gfx1100 checks form base - 4096 in v[0:1] and apply the remaining -1 as the immediate. |
| ; Both gfx1200 check variants add the full offset to the SGPR base and then use s_load_u8 with a zero immediate. |
| define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_neg4097: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_add_u32 s0, s2, 0xffffefff |
| ; GFX9-NEXT: s_addc_u32 s1, s3, -1 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_neg4097: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_neg4097: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg4097: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xefff |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg4097: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xffffefff |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum negative gfx9 immediate offset -2 |
| ; Byte load at %sbase - 4098, zero-extended and returned as float bits. |
| ; The gfx900 checks add the whole offset (0xffffeffe, -1) to the SGPR base with scalar adds; the gfx1010 and gfx1100 checks form base - 4096 in v[0:1] and apply the remaining -2 as the immediate. |
| ; Both gfx1200 check variants add the full offset to the SGPR base and then use s_load_u8 with a zero immediate. |
| define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_neg4098: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_add_u32 s0, s2, 0xffffeffe |
| ; GFX9-NEXT: s_addc_u32 s1, s3, -1 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_neg4098: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_neg4098: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg4098: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xeffe |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg4098: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xffffeffe |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum gfx10 immediate offset + 1 |
| ; Byte load at %sbase + 2048, zero-extended and returned as float bits. |
| ; The gfx900 and gfx1100 checks fold 2048 into the instruction immediate; the gfx1010 checks instead place 0x800 in the VGPR offset and use no immediate; the gfx1200 checks use s_load_u8 with immediate 0x800. |
| define amdgpu_ps float @global_load_saddr_i8_offset_2048(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_2048: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_2048: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 |
| ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_2048: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_2048: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x800 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2048 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum gfx10 immediate offset + 2 |
| ; Byte load at %sbase + 2049, zero-extended and returned as float bits. |
| ; The gfx900 and gfx1100 checks fold 2049 into the instruction immediate; the gfx1010 checks split it into a 0x800 VGPR offset plus an immediate of 1; the gfx1200 checks use s_load_u8 with immediate 0x801. |
| define amdgpu_ps float @global_load_saddr_i8_offset_2049(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_2049: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2049 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_2049: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 |
| ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_2049: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2049 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_2049: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x801 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2049 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum gfx10 immediate offset + 3 |
| ; Byte load at %sbase + 2050, zero-extended and returned as float bits. |
| ; The gfx900 and gfx1100 checks fold 2050 into the instruction immediate; the gfx1010 checks split it into a 0x800 VGPR offset plus an immediate of 2; the gfx1200 checks use s_load_u8 with immediate 0x802. |
| define amdgpu_ps float @global_load_saddr_i8_offset_2050(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_2050: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2050 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_2050: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 |
| ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_2050: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2050 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_2050: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x802 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2050 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum negative gfx10 immediate offset |
| ; Byte load at %sbase - 2048, zero-extended and returned as float bits. |
| ; The gfx900, gfx1010 and gfx1100 checks all fold -2048 into the instruction immediate with a zero VGPR offset. |
| ; Both gfx1200 check variants add the offset to the SGPR base with scalar ALU instructions and then use s_load_u8 with a zero immediate. |
| define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inreg %sbase) { |
| ; GCN-LABEL: global_load_saddr_i8_offset_neg2048: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mov_b32_e32 v0, 0 |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_neg2048: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg2048: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xf800 |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg2048: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfffff800 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum negative gfx10 immediate offset - 1 |
| ; Byte load at %sbase - 2049, zero-extended and returned as float bits. |
| ; The gfx900 and gfx1100 checks fold -2049 into the instruction immediate; the gfx1010 checks form base - 2048 in v[0:1] and apply the remaining -1 as the immediate. |
| ; Both gfx1200 check variants add the full offset to the SGPR base and then use s_load_u8 with a zero immediate. |
| define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_neg2049: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_neg2049: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_neg2049: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg2049: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xf7ff |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg2049: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfffff7ff |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; SGPR base with maximum negative gfx10 immediate offset - 2 |
| ; Byte load at %sbase - 2050, zero-extended and returned as float bits. |
| ; The gfx900 and gfx1100 checks fold -2050 into the instruction immediate; the gfx1010 checks form base - 2048 in v[0:1] and apply the remaining -2 as the immediate. |
| ; Both gfx1200 check variants add the full offset to the SGPR base and then use s_load_u8 with a zero immediate. |
| define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_neg2050: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2050 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_neg2050: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_neg2050: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2050 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg2050: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xf7fe |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg2050: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfffff7fe |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Byte load at %sbase + 8388607 (0x7FFFFF), zero-extended and returned as float bits. |
| ; The gfx900 and gfx1100 checks split the offset into a 0x7ff000 VGPR offset plus an immediate of 4095; the gfx1010 checks split it into 0x7ff800 plus 2047; the gfx1200 checks fold the whole value into the s_load_u8 immediate. |
| define amdgpu_ps float @global_load_saddr_i8_offset_0x7FFFFF(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_0x7FFFFF: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7ff000 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_0x7FFFFF: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7ff800 |
| ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_0x7FFFFF: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7ff000 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_0x7FFFFF: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], 0x7fffff |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 8388607 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Byte load at %sbase - 8388608 (-0x800000), zero-extended and returned as float bits. |
| ; Despite the 0xFFFFFF in the function name, the GEP offset is negative, which is why every target's checks add 0xff800000 to the low dword and -1 to the high dword before loading with no further offset. |
| ; NOTE(review) the name does not match the offset actually tested; presumably it refers to a 24-bit boundary - confirm before renaming, since the autogenerated labels depend on the name. |
| define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFF(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_0xFFFFFF: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_add_u32 s0, s2, 0xff800000 |
| ; GFX9-NEXT: s_addc_u32 s1, s3, -1 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_0xFFFFFF: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xff800000, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_0xFFFFFF: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xff800000, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0xFFFFFF: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_mov_b32 s0, 0xff800000 |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0xFFFFFF: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xff800000 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -8388608 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Byte load at %sbase + 4294967295 (0xFFFFFFFF), zero-extended and returned as float bits. |
| ; The gfx900 and gfx1100 checks split the offset into a 0xfffff000 VGPR offset plus an immediate of 4095; the gfx1010 checks split it into 0xfffff800 plus 2047. |
| ; The gfx1200 checks move -1 into an SGPR and pass it as the register offset operand of s_load_u8 with a zero immediate. |
| define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0xfffff000 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0xfffff800 |
| ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0xfffff000 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_0xFFFFFFFF: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_mov_b32 s0, -1 |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], s0 offset:0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967295 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Byte load at %sbase + 4294967296 (0x100000000), zero-extended and returned as float bits. |
| ; The gfx900, gfx1010, gfx1100 and gfx1200 SelectionDAG checks only increment the high dword s3 of the base by 1 and then load with no offset. |
| ; The gfx1200 GlobalISel checks instead perform a full add-with-carry pair (low dword + 0, high dword + 1) into s[0:1]. |
| define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_add_i32 s3, s3, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_add_i32 s3, s3, 1 |
| ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: s_add_i32 s3, s3, 1 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, 1 |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000000: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967296 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Byte load at %sbase + 4294967297 (0x100000001), zero-extended and returned as float bits. |
| ; The gfx900, gfx1010 and gfx1100 checks build base + 0x100000000 in v[0:1] (low dword + 0, high dword + 1 with carry) and apply the remaining 1 as the immediate; the gfx1200 GlobalISel checks add 1 to both dwords with a scalar add-with-carry pair. |
| ; NOTE(review) the gfx1200 SelectionDAG check expects s_add_nc_u64 with a source operand printed as 1, whereas every other target visibly adds 1 to the high dword as well. Confirm by regenerating with utils/update_llc_test_checks.py that this line matches current llc output and that the high-dword increment is not being dropped; do not hand-edit the check. |
| define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_0x100000001: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_0x100000001: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_0x100000001: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], 1 |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000001: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 1 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967297 |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| define amdgpu_ps float @global_load_saddr_i8_offset_0x100000FFF(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_0x100000FFF: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_add_u32 s0, s2, 0xfff |
| ; GFX9-NEXT: s_addc_u32 s1, s3, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_0x100000FFF: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_0x100000FFF: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000FFF: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xfff |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, 1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100000FFF: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; Loads the byte at %sbase + 0x100000FFF (4294971391), zero-extends it to i32 |
| ; and returns those bits reinterpreted as a float. |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971391 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| define amdgpu_ps float @global_load_saddr_i8_offset_0x100001000(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_0x100001000: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_add_u32 s0, s2, 0x1000 |
| ; GFX9-NEXT: s_addc_u32 s1, s3, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_0x100001000: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_0x100001000: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100001000: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x1000 |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, 1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_0x100001000: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; Loads the byte at %sbase + 0x100001000 (4294971392), zero-extends it to i32 |
| ; and returns those bits reinterpreted as a float. |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971392 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4095 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_mov_b32 s0, 1 |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0xFFFFFFFF: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 1 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; Loads the byte at %sbase - 0xFFFFFFFF (-4294967295), zero-extends it to i32 |
| ; and returns those bits reinterpreted as a float. |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967295 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_add_i32 s3, s3, -1 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_add_i32 s3, s3, -1 |
| ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: s_add_i32 s3, s3, -1 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, -1 |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000000: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; Loads the byte at %sbase - 0x100000000 (-4294967296; the low 32 bits of the |
| ; offset are zero), zero-extends it to i32 and returns those bits as a float. |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967296 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000001(ptr addrspace(1) inreg %sbase) { |
| ; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000001: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000001: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000001: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000001: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_mov_b32 s0, -1 |
| ; GFX12-SDAG-NEXT: s_mov_b32 s1, -2 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_offset_neg0x100000001: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, -1 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -2 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; Loads the byte at %sbase - 0x100000001 (-4294967297), zero-extends it to i32 |
| ; and returns those bits reinterpreted as a float. |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967297 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; -------------------------------------------------------------------------------- |
| ; Basic addressing patterns |
| ; -------------------------------------------------------------------------------- |
| |
| ; Basic pattern, no immediate offset. |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_vgpr: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is %sbase plus the zero-extended 32-bit %voffset, with no constant |
| ; offset; the loaded byte is zero-extended and returned as float bits. |
| %voff64 = zext i32 %voffset to i64 |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %voff64 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; Maximum positive offset on gfx9 |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is (%sbase + zext(%voffset)) + 4095; the loaded byte is |
| ; zero-extended and returned as float bits. |
| %voff64 = zext i32 %voffset to i64 |
| %base.voff = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %voff64 |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %base.voff, i64 4095 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; Maximum positive offset on gfx9 + 1 |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4096 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is (%sbase + zext(%voffset)) + 4096; the loaded byte is |
| ; zero-extended and returned as float bits. |
| %voff64 = zext i32 %voffset to i64 |
| %base.voff = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %voff64 |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %base.voff, i64 4096 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; Maximum negative offset on gfx9 |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is (%sbase + zext(%voffset)) - 4096; the loaded byte is |
| ; zero-extended and returned as float bits. |
| %voff64 = zext i32 %voffset to i64 |
| %base.voff = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %voff64 |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %base.voff, i64 -4096 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; Maximum negative offset on gfx9 - 1 |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4097 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is (%sbase + zext(%voffset)) - 4097; the loaded byte is |
| ; zero-extended and returned as float bits. |
| %voff64 = zext i32 %voffset to i64 |
| %base.voff = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %voff64 |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %base.voff, i64 -4097 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; Maximum positive offset on gfx10 |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is (%sbase + zext(%voffset)) + 2047; the loaded byte is |
| ; zero-extended and returned as float bits. |
| %voff64 = zext i32 %voffset to i64 |
| %base.voff = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %voff64 |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %base.voff, i64 2047 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; Maximum positive offset on gfx10 + 1 |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is (%sbase + zext(%voffset)) + 2048; the loaded byte is |
| ; zero-extended and returned as float bits. |
| %voff64 = zext i32 %voffset to i64 |
| %base.voff = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %voff64 |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %base.voff, i64 2048 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; Maximum negative offset on gfx10 |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is (%sbase + zext(%voffset)) - 2048; the loaded byte is |
| ; zero-extended and returned as float bits. |
| %voff64 = zext i32 %voffset to i64 |
| %base.voff = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %voff64 |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %base.voff, i64 -2048 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; Maximum negative offset on gfx10 - 1 |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is (%sbase + zext(%voffset)) - 2049; the loaded byte is |
| ; zero-extended and returned as float bits. |
| %voff64 = zext i32 %voffset to i64 |
| %base.voff = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %voff64 |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %base.voff, i64 -2049 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; Maximum positive offset on gfx12. |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff000, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x7ff800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc, 0x7ff000, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8388607 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is (%sbase + zext(%voffset)) + 0x7FFFFF (8388607); the loaded byte is |
| ; zero-extended and returned as float bits. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 8388607 |
| %load = load i8, ptr addrspace(1) %gep1 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Minimum (most negative) immediate offset on gfx12, -0x800000 (-8388608). |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xff800000, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xff800000, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1] |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xff800000, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8388608 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Address is (%sbase + zext(%voffset)) - 0x800000 (-8388608); the loaded byte is |
| ; zero-extended and returned as float bits. Note that the constant tested here |
| ; is -0x800000, not the 0xFFFFFF that the function name suggests. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -8388608 |
| %load = load i8, ptr addrspace(1) %gep1 |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| |
| ; Maximum positive offset on gfx9, and immediate needs to be moved lower. |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Here the constant 4095 is added to %sbase first and the zero-extended |
| ; %voffset second, i.e. (%sbase + 4095) + zext(%voffset). |
| %voff64 = zext i32 %voffset to i64 |
| %base.imm = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095 |
| %addr = getelementptr inbounds i8, ptr addrspace(1) %base.imm, i64 %voff64 |
| %byte = load i8, ptr addrspace(1) %addr |
| %byte.i32 = zext i8 %byte to i32 |
| %result = bitcast i32 %byte.i32 to float |
| ret float %result |
| } |
| |
| ; Pointer addressing done in integers: the address is formed with ptrtoint, a 64-bit |
| ; add of the zero-extended VGPR offset, and inttoptr, rather than with a GEP. |
| ; Expected: every prefix still selects the saddr form (base s[2:3], offset v0). |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| ; Integer form of "base + offset": base is the first add operand here. |
| %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 |
| %add = add i64 %sbase.as.int, %zext.offset |
| %dirty.gep = inttoptr i64 %add to ptr addrspace(1) |
| %load = load i8, ptr addrspace(1) %dirty.gep |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; zext forced to LHS of addressing expression: the address is built with |
| ; ptrtoint/add/inttoptr, and the zero-extended VGPR offset is the first add operand. |
| ; Expected: every prefix selects the saddr form (base s[2:3], offset v0). |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 |
| ; Commuted operands: offset on the left, base on the right. |
| %add = add i64 %zext.offset, %sbase.as.int |
| %dirty.gep = inttoptr i64 %add to ptr addrspace(1) |
| %load = load i8, ptr addrspace(1) %dirty.gep |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; zext forced to LHS of addressing expression, with immediate offset: the constant 128 |
| ; is added after the (offset + base) sum. |
| ; Expected: every prefix selects the saddr form with offset:128. |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 |
| ; Commuted variable add first, then the constant on the outside. |
| %add = add i64 %zext.offset, %sbase.as.int |
| %add.immoffset = add i64 %add, 128 |
| %dirty.gep = inttoptr i64 %add.immoffset to ptr addrspace(1) |
| %load = load i8, ptr addrspace(1) %dirty.gep |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; zext forced to LHS of addressing expression, with immediate offset in non-canonical |
| ; position: 128 is added to the base first, and the zero-extended offset is then added |
| ; with that sum as its right-hand operand. |
| ; Expected: every prefix still selects the saddr form with offset:128. |
| define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 |
| ; Constant is folded into the base before the variable add. |
| %add.immoffset = add i64 %sbase.as.int, 128 |
| %add = add i64 %zext.offset, %add.immoffset |
| %dirty.gep = inttoptr i64 %add to ptr addrspace(1) |
| %load = load i8, ptr addrspace(1) %dirty.gep |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; -------------------------------------------------------------------------------- |
| ; Uniformity edge cases |
| ; -------------------------------------------------------------------------------- |
| |
| ; LDS (addrspace(3)) slot holding a global (addrspace(1)) pointer. Tests in this |
| ; section load their base pointer from it so that the base arrives in VGPRs. |
| ; The initializer is a placeholder only; poison is used instead of the deprecated undef. |
| @ptr.in.lds = internal addrspace(3) global ptr addrspace(1) poison |
| |
| ; Base pointer is uniform, but also in VGPRs: it is loaded from the LDS global |
| ; @ptr.in.lds rather than passed as an inreg argument, and the zero-extended VGPR |
| ; offset is added to it. |
| ; Expected: gfx9, gfx10, gfx11 and gfx12 SelectionDAG copy both halves of the loaded |
| ; pointer to s[0:1] with v_readfirstlane_b32 and use the saddr form (gfx9 also has an |
| ; s_nop 4 before the load). gfx12 GlobalISel instead does a 64-bit VALU add and loads |
| ; through v[0:1] with "off". |
| define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: ds_read_b64 v[1:2], v1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX9-NEXT: s_nop 4 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: ds_read_b64 v[1:2], v1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-NEXT: ds_load_b64 v[1:2], v1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_uniform_ptr_in_vgprs: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-SDAG-NEXT: ds_load_b64 v[1:2], v1 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_uniform_ptr_in_vgprs: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-GISEL-NEXT: ds_load_b64 v[1:2], v1 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v1, v0 |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc |
| ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; The base pointer comes from an LDS load, so it is produced in VGPRs. |
| %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i8, ptr addrspace(1) %gep0 |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Base pointer is uniform, but also in VGPRs, with imm offset: the base is loaded from |
| ; the LDS global @ptr.in.lds, the zero-extended VGPR offset is added, then constant 42. |
| ; Expected: gfx9, gfx10, gfx11 and gfx12 SelectionDAG copy the pointer to s[0:1] with |
| ; v_readfirstlane_b32 and use the saddr form with offset:42 (gfx9 also has an s_nop 4 |
| ; before the load). gfx12 GlobalISel does a 64-bit VALU add and loads through v[0:1] |
| ; with "off offset:42". |
| define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) { |
| ; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: ds_read_b64 v[1:2], v1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX9-NEXT: s_nop 4 |
| ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: ds_read_b64 v[1:2], v1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-NEXT: ds_load_b64 v[1:2], v1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:42 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-SDAG-NEXT: ds_load_b64 v[1:2], v1 |
| ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] offset:42 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-GISEL-NEXT: ds_load_b64 v[1:2], v1 |
| ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v1, v0 |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc |
| ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:42 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; The base pointer comes from an LDS load, so it is produced in VGPRs. |
| %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds |
| %zext.offset = zext i32 %voffset to i64 |
| ; Variable offset first, then the constant byte offset. |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 42 |
| %load = load i8, ptr addrspace(1) %gep1 |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Both 64-bit base and 32-bit offset are scalar (both arguments are inreg). |
| ; Expected: gfx9, gfx10 and gfx11 copy the offset from s4 into v0 and use the saddr |
| ; global load; gfx12 uses a scalar s_load_u8 with s4 as the register offset and then |
| ; copies the result into v0. |
| define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mov_b32_e32 v0, s4 |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_uniform_offset: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], s4 offset:0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %soffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i8, ptr addrspace(1) %gep0 |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Both 64-bit base and 32-bit offset are scalar, with immediate offset (-24). |
| ; Expected: gfx9, gfx10 and gfx11 copy s4 into v0 and use the saddr global load with |
| ; offset:-24. On gfx12 the negative constant is not folded into the scalar load: both |
| ; selectors compute the full address with scalar adds (s_add_nc_u64 for SelectionDAG, |
| ; s_add_co_u32/s_add_co_ci_u32 pairs for GlobalISel) and issue s_load_u8 with 0x0. |
| define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mov_b32_e32 v0, s4 |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-24 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-24 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: s_mov_b32 s5, 0 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[4:5] |
| ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xffe8 |
| ; GFX12-SDAG-NEXT: s_mov_b32 s3, -1 |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] |
| ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, s4 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 |
| ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffffe8 |
| ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 |
| ; GFX12-GISEL-NEXT: s_load_u8 s0, s[0:1], 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %soffset to i64 |
| ; Variable offset first, then the negative constant byte offset. |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -24 |
| %load = load i8, ptr addrspace(1) %gep1 |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Both components uniform, zext forced to LHS of addressing expression: the address is |
| ; built with ptrtoint/add/inttoptr and the zero-extended scalar offset is the first |
| ; add operand. |
| ; Expected: gfx9, gfx10 and gfx11 copy s4 into v0 and use the saddr global load; gfx12 |
| ; uses a scalar s_load_u8 with s4 as the register offset. |
| define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mov_b32_e32 v0, s4 |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], s4 offset:0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %soffset to i64 |
| %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 |
| ; Commuted operands: offset on the left, base on the right. |
| %add = add i64 %zext.offset, %sbase.as.int |
| %dirty.gep = inttoptr i64 %add to ptr addrspace(1) |
| %load = load i8, ptr addrspace(1) %dirty.gep |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Both components uniform, zext forced to LHS of addressing expression, with immediate |
| ; offset: the constant 128 is added after the (offset + base) sum. |
| ; Expected: gfx9, gfx10 and gfx11 copy s4 into v0 and use the saddr global load with |
| ; offset:128; gfx12 uses a scalar s_load_u8 with s4 as register offset and offset:0x80. |
| define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) { |
| ; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mov_b32_e32 v0, s4 |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_u8 s0, s[2:3], s4 offset:0x80 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %soffset to i64 |
| %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64 |
| ; Commuted variable add first, then the constant on the outside. |
| %add = add i64 %zext.offset, %sbase.as.int |
| %add.immoffset = add i64 %add, 128 |
| %dirty.gep = inttoptr i64 %add.immoffset to ptr addrspace(1) |
| %load = load i8, ptr addrspace(1) %dirty.gep |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Divergent 64-bit base, 32-bit scalar offset: the roles are swapped relative to the |
| ; saddr pattern (%vbase is a plain argument, %soffset is inreg). |
| ; Expected: no prefix uses the saddr form; every target does a 64-bit VALU add and |
| ; loads through v[0:1] with "off". gfx12 GlobalISel first builds the zero-extended |
| ; offset in s[2:3] and copies it into v[2:3] before the add. |
| define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i32 inreg %soffset) { |
| ; GFX9-LABEL: global_load_i8_vgpr64_sgpr32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_i8_vgpr64_sgpr32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_i8_vgpr64_sgpr32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_i8_vgpr64_sgpr32: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, v0, s2 |
| ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_mov_b32 s3, 0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v0, v2 |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc |
| ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %soffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset |
| %load = load i8, ptr addrspace(1) %gep0 |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Divergent 64-bit base, 32-bit scalar offset, with imm offset (4095). |
| ; Expected: no prefix uses the saddr form; every target does a 64-bit VALU add of the |
| ; scalar offset and loads through v[0:1] with "off". gfx9, gfx11 and gfx12 put the |
| ; constant in offset:4095; gfx10 splits it into a 0x800 VALU add plus offset:2047. |
| define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1) %vbase, i32 inreg %soffset) { |
| ; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2 |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, v0, s2 |
| ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: s_mov_b32 s3, 0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v0, v2 |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc |
| ; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:4095 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %soffset to i64 |
| ; Variable offset first, then the constant byte offset. |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095 |
| %load = load i8, ptr addrspace(1) %gep1 |
| ; Zero-extend the byte and return it as a float bit pattern (result in v0). |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; -------------------------------------------------------------------------------- |
| ; Natural addressing shifts with restricted range |
| ; -------------------------------------------------------------------------------- |
| |
| ; Cannot push the shift into 32-bits, and cannot match: the index is loaded from memory |
| ; with no range metadata, zero-extended, and used as a float element index (scaled by |
| ; 4), so the scaling has to be done on the 64-bit value. |
| ; Expected: every prefix uses a 64-bit v_lshlrev_b64 by 2 followed by a 64-bit VALU add |
| ; of the base, and loads through v[0:1] with "off" (no saddr form). |
| define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) { |
| ; GFX9-LABEL: global_load_saddr_f32_natural_addressing: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s3 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_saddr_f32_natural_addressing: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc |
| ; GFX10-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_f32_natural_addressing: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_f32_natural_addressing: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] |
| ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, s2, v0 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc |
| ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_f32_natural_addressing: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v2, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc |
| ; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; Index loaded without !range: any 32-bit value is possible. |
| %voffset = load i32, ptr addrspace(1) %voffset.ptr |
| %zext.offset = zext i32 %voffset to i64 |
| ; float-typed GEP: the index is scaled by the 4-byte element size. |
| %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load float, ptr addrspace(1) %gep |
| ret float %load |
| } |
| |
| ; Unrestricted loaded offset with an immediate offset. Despite the function name, the |
| ; address here is built from byte (i8) GEPs, so no shift is involved: the zero-extended |
| ; offset is added unscaled, followed by the constant 128. |
| ; Expected: every prefix selects the saddr form with offset:128. |
| ; NOTE(review): this comment previously read "Cannot push the shift into 32-bits, with |
| ; an immediate offset", which does not match the i8 GEPs below; presumably float GEPs |
| ; were intended. Confirm before changing the IR, since the checks would need regenerating. |
| define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) { |
| ; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v[0:1], off |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_f32_natural_addressing_immoffset: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_f32_natural_addressing_immoffset: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; Index loaded without !range: any 32-bit value is possible. |
| %voffset = load i32, ptr addrspace(1) %voffset.ptr |
| %zext.offset = zext i32 %voffset to i64 |
| ; Byte-granular GEPs: variable offset, then the constant 128. |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 128 |
| %load = load float, ptr addrspace(1) %gep1 |
| ret float %load |
| } |
| |
| ; Range is sufficiently restricted to push the shift into 32-bits: the index load |
| ; carries !range !0 and !noundef, and is then used as a float element index. |
| ; (!0 is defined outside this part of the file; its bounds are not visible here.) |
| ; Expected: every prefix scales the index with a 32-bit v_lshlrev_b32 by 2 and selects |
| ; the saddr form (base s[2:3], offset v0). |
| define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) { |
| ; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v[0:1], off |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; The range metadata on this load is what bounds the index. |
| %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{} |
| %zext.offset = zext i32 %voffset to i64 |
| ; float-typed GEP: the index is scaled by the 4-byte element size. |
| %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load float, ptr addrspace(1) %gep |
| ret float %load |
| } |
| |
| ; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset: |
| ; the index load carries !range !0 and !noundef, is used as a float element index, and |
| ; a further 100 float elements (400 bytes) are added. |
| ; (!0 is defined outside this part of the file; its bounds are not visible here.) |
| ; Expected: every prefix scales the index with a 32-bit v_lshlrev_b32 by 2 and selects |
| ; the saddr form with offset:400. |
| define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) { |
| ; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v[0:1], off |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:400 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:400 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:400 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| ; The range metadata on this load is what bounds the index. |
| %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{} |
| %zext.offset = zext i32 %voffset to i64 |
| ; float-typed GEPs: variable index first, then a constant 100 elements. |
| %gep0 = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds float, ptr addrspace(1) %gep0, i64 100 |
| %load = load float, ptr addrspace(1) %gep1 |
| ret float %load |
| } |
| |
| ; Range is 1 beyond the limit where we can move the shift into 32 bits. |
| ; The offset is loaded with !range !1 (metadata defined outside this excerpt). The |
| ; expected code keeps a 64-bit shift (v_lshlrev_b64 by 2), adds the base with an |
| ; explicit add / add-with-carry pair, and issues the final load with a full VGPR |
| ; address (`v[0:1], off`) rather than the `vN, s[2:3]` saddr form. The gfx900 |
| ; output first copies s3 into a VGPR, and the gfx1200 GlobalISel output copies |
| ; both s2 and s3, which is why those runs have their own check prefixes here. |
| define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) { |
| ; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s3 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] |
| ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 |
| ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] |
| ; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc |
| ; GFX10-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] |
| ; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] |
| ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, s2, v0 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc |
| ; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v2, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc |
| ; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !1, !noundef !{} |
| %zext.offset = zext i32 %voffset to i64 |
| %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load float, ptr addrspace(1) %gep |
| ret float %load |
| } |
| |
| ; -------------------------------------------------------------------------------- |
| ; Stress various type loads |
| ; -------------------------------------------------------------------------------- |
| |
| ; i16 load from %sbase + zext(%voffset), using a byte-granular GEP; the loaded bits |
| ; are returned as half. Every run line expects a single 16-bit unsigned load |
| ; (global_load_ushort / global_load_u16) addressed as `v0, s[2:3]` with no |
| ; immediate offset. |
| define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i16, ptr addrspace(1) %gep0 |
| %cast.load = bitcast i16 %load to half |
| ret half %cast.load |
| } |
| |
| ; i16 load from %sbase + zext(%voffset) followed by a constant -128 byte GEP; the |
| ; loaded bits are returned as half. Every run line expects a single 16-bit |
| ; unsigned load addressed as `v0, s[2:3]` with the constant carried as the |
| ; immediate `offset:-128`. |
| define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i16_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i16_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i16_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load i16, ptr addrspace(1) %gep1 |
| %cast.load = bitcast i16 %load to half |
| ret half %cast.load |
| } |
| |
| ; half load from %sbase + zext(%voffset), using a byte-granular GEP; the value is |
| ; returned directly. Every run line expects a single 16-bit unsigned load |
| ; (global_load_ushort / global_load_u16) addressed as `v0, s[2:3]` with no |
| ; immediate offset. |
| define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_f16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_f16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_f16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load half, ptr addrspace(1) %gep0 |
| ret half %load |
| } |
| |
| ; half load from %sbase + zext(%voffset) followed by a constant -128 byte GEP; the |
| ; value is returned directly. Every run line expects a single 16-bit unsigned |
| ; load addressed as `v0, s[2:3]` with the constant carried as the immediate |
| ; `offset:-128`. |
| define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_f16_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_f16_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_f16_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load half, ptr addrspace(1) %gep1 |
| ret half %load |
| } |
| |
| ; i32 load from %sbase + zext(%voffset), using a byte-granular GEP; the loaded bits |
| ; are returned as float. Every run line expects a single 32-bit load |
| ; (global_load_dword / global_load_b32) addressed as `v0, s[2:3]` with no |
| ; immediate offset. |
| define amdgpu_ps float @global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i32, ptr addrspace(1) %gep0 |
| %cast.load = bitcast i32 %load to float |
| ret float %cast.load |
| } |
| |
| ; i32 load from %sbase + zext(%voffset) followed by a constant -128 byte GEP; the |
| ; loaded bits are returned as float. Every run line expects a single 32-bit load |
| ; addressed as `v0, s[2:3]` with the constant carried as the immediate |
| ; `offset:-128`. |
| define amdgpu_ps float @global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i32_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i32_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i32_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load i32, ptr addrspace(1) %gep1 |
| %cast.load = bitcast i32 %load to float |
| ret float %cast.load |
| } |
| |
| ; float load from %sbase + zext(%voffset), using a byte-granular GEP; the value is |
| ; returned directly. Every run line expects a single 32-bit load |
| ; (global_load_dword / global_load_b32) addressed as `v0, s[2:3]` with no |
| ; immediate offset. |
| define amdgpu_ps float @global_load_saddr_f32(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load float, ptr addrspace(1) %gep0 |
| ret float %load |
| } |
| |
| ; float load from %sbase + zext(%voffset) followed by a constant -128 byte GEP; the |
| ; value is returned directly. Every run line expects a single 32-bit load |
| ; addressed as `v0, s[2:3]` with the constant carried as the immediate |
| ; `offset:-128`. |
| define amdgpu_ps float @global_load_saddr_f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_f32_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_f32_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_f32_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load float, ptr addrspace(1) %gep1 |
| ret float %load |
| } |
| |
| ; <2 x i16> load from %sbase + zext(%voffset), using a byte-granular GEP; the |
| ; loaded bits are returned as <2 x half>. Every run line expects a single 32-bit |
| ; load (global_load_dword / global_load_b32) addressed as `v0, s[2:3]` with no |
| ; immediate offset. |
| define amdgpu_ps <2 x half> @global_load_saddr_v2i16(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2i16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2i16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2i16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <2 x i16>, ptr addrspace(1) %gep0 |
| %cast.load = bitcast <2 x i16> %load to <2 x half> |
| ret <2 x half> %cast.load |
| } |
| |
| ; <2 x i16> load from %sbase + zext(%voffset) followed by a constant -128 byte GEP; |
| ; the loaded bits are returned as <2 x half>. Every run line expects a single |
| ; 32-bit load addressed as `v0, s[2:3]` with the constant carried as the |
| ; immediate `offset:-128`. |
| define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2i16_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2i16_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2i16_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <2 x i16>, ptr addrspace(1) %gep1 |
| %cast.load = bitcast <2 x i16> %load to <2 x half> |
| ret <2 x half> %cast.load |
| } |
| |
| ; <2 x half> load from %sbase + zext(%voffset), using a byte-granular GEP; the |
| ; value is returned directly. Every run line expects a single 32-bit load |
| ; (global_load_dword / global_load_b32) addressed as `v0, s[2:3]` with no |
| ; immediate offset. |
| define amdgpu_ps <2 x half> @global_load_saddr_v2f16(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2f16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2f16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2f16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <2 x half>, ptr addrspace(1) %gep0 |
| ret <2 x half> %load |
| } |
| |
| ; <2 x half> load from %sbase + zext(%voffset) followed by a constant -128 byte |
| ; GEP; the value is returned directly. Every run line expects a single 32-bit |
| ; load addressed as `v0, s[2:3]` with the constant carried as the immediate |
| ; `offset:-128`. |
| define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2f16_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2f16_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2f16_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <2 x half>, ptr addrspace(1) %gep1 |
| ret <2 x half> %load |
| } |
| |
| ; Loads a `ptr addrspace(3)` value from %sbase + zext(%voffset) (byte-granular |
| ; GEP), converts it to i32 with ptrtoint, and returns the bits as <2 x half>. |
| ; Every run line expects a single 32-bit load (global_load_dword / |
| ; global_load_b32) addressed as `v0, s[2:3]` with no immediate offset. |
| define amdgpu_ps <2 x half> @global_load_saddr_p3(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_p3: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_p3: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_p3: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load ptr addrspace(3), ptr addrspace(1) %gep0 |
| %cast.load0 = ptrtoint ptr addrspace(3) %load to i32 |
| %cast.load1 = bitcast i32 %cast.load0 to <2 x half> |
| ret <2 x half> %cast.load1 |
| } |
| |
| ; Loads a `ptr addrspace(3)` value from %sbase + zext(%voffset) followed by a |
| ; constant -128 byte GEP, converts it to i32 with ptrtoint, and returns the bits |
| ; as <2 x half>. Every run line expects a single 32-bit load addressed as |
| ; `v0, s[2:3]` with the constant carried as the immediate `offset:-128`. |
| define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_p3_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_p3_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_p3_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load ptr addrspace(3), ptr addrspace(1) %gep1 |
| %cast.load0 = ptrtoint ptr addrspace(3) %load to i32 |
| %cast.load1 = bitcast i32 %cast.load0 to <2 x half> |
| ret <2 x half> %cast.load1 |
| } |
| |
| ; double load from %sbase + zext(%voffset), using a byte-granular GEP; the loaded |
| ; bits are returned as <2 x float>. Every run line expects a single 64-bit load |
| ; (global_load_dwordx2 / global_load_b64) into v[0:1], addressed as |
| ; `v0, s[2:3]` with no immediate offset. |
| define amdgpu_ps <2 x float> @global_load_saddr_f64(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_f64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_f64: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_f64: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load double, ptr addrspace(1) %gep0 |
| %cast.load = bitcast double %load to <2 x float> |
| ret <2 x float> %cast.load |
| } |
| |
| ; double load from %sbase + zext(%voffset) followed by a constant -128 byte GEP; |
| ; the loaded bits are returned as <2 x float>. Every run line expects a single |
| ; 64-bit load into v[0:1], addressed as `v0, s[2:3]` with the constant carried as |
| ; the immediate `offset:-128`. |
| define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_f64_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_f64_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_f64_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load double, ptr addrspace(1) %gep1 |
| %cast.load = bitcast double %load to <2 x float> |
| ret <2 x float> %cast.load |
| } |
| |
| ; i64 load from %sbase + zext(%voffset), using a byte-granular GEP; the loaded bits |
| ; are returned as <2 x float>. Every run line expects a single 64-bit load |
| ; (global_load_dwordx2 / global_load_b64) into v[0:1], addressed as |
| ; `v0, s[2:3]` with no immediate offset. |
| define amdgpu_ps <2 x float> @global_load_saddr_i64(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i64: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i64: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i64, ptr addrspace(1) %gep0 |
| %cast.load = bitcast i64 %load to <2 x float> |
| ret <2 x float> %cast.load |
| } |
| |
| ; i64 load from %sbase + zext(%voffset) followed by a constant -128 byte GEP; the |
| ; loaded bits are returned as <2 x float>. Every run line expects a single 64-bit |
| ; load into v[0:1], addressed as `v0, s[2:3]` with the constant carried as the |
| ; immediate `offset:-128`. |
| define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i64_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i64_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i64_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load i64, ptr addrspace(1) %gep1 |
| %cast.load = bitcast i64 %load to <2 x float> |
| ret <2 x float> %cast.load |
| } |
| |
| ; <2 x float> load from %sbase + zext(%voffset), using a byte-granular GEP; the |
| ; value is returned directly. Every run line expects a single 64-bit load |
| ; (global_load_dwordx2 / global_load_b64) into v[0:1], addressed as |
| ; `v0, s[2:3]` with no immediate offset. |
| define amdgpu_ps <2 x float> @global_load_saddr_v2f32(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <2 x float>, ptr addrspace(1) %gep0 |
| ret <2 x float> %load |
| } |
| |
| ; <2 x float> load from %sbase + zext(%voffset) followed by a constant -128 byte |
| ; GEP; the value is returned directly. Every run line expects a single 64-bit |
| ; load into v[0:1], addressed as `v0, s[2:3]` with the constant carried as the |
| ; immediate `offset:-128`. |
| define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2f32_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2f32_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2f32_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <2 x float>, ptr addrspace(1) %gep1 |
| ret <2 x float> %load |
| } |
| |
| ; <2 x i32> load from %sbase + zext(%voffset), using a byte-granular GEP; the |
| ; loaded bits are returned as <2 x float>. Every run line expects a single 64-bit |
| ; load (global_load_dwordx2 / global_load_b64) into v[0:1], addressed as |
| ; `v0, s[2:3]` with no immediate offset. |
| define amdgpu_ps <2 x float> @global_load_saddr_v2i32(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2i32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <2 x i32>, ptr addrspace(1) %gep0 |
| %cast.load = bitcast <2 x i32> %load to <2 x float> |
| ret <2 x float> %cast.load |
| } |
| |
| ; <2 x i32> load from %sbase + zext(%voffset) followed by a constant -128 byte GEP; |
| ; the loaded bits are returned as <2 x float>. Every run line expects a single |
| ; 64-bit load into v[0:1], addressed as `v0, s[2:3]` with the constant carried as |
| ; the immediate `offset:-128`. |
| define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2i32_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2i32_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2i32_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <2 x i32>, ptr addrspace(1) %gep1 |
| %cast.load = bitcast <2 x i32> %load to <2 x float> |
| ret <2 x float> %cast.load |
| } |
| |
| ; <4 x i16> load from %sbase + zext(%voffset), using a byte-granular GEP; the |
| ; loaded bits are returned as <2 x float>. Every run line expects a single 64-bit |
| ; load (global_load_dwordx2 / global_load_b64) into v[0:1], addressed as |
| ; `v0, s[2:3]` with no immediate offset. |
| define amdgpu_ps <2 x float> @global_load_saddr_v4i16(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v4i16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v4i16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v4i16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <4 x i16>, ptr addrspace(1) %gep0 |
| %cast.load = bitcast <4 x i16> %load to <2 x float> |
| ret <2 x float> %cast.load |
| } |
| |
| ; <4 x i16> load from %sbase + zext(%voffset) followed by a constant -128 byte GEP; |
| ; the loaded bits are returned as <2 x float>. Every run line expects a single |
| ; 64-bit load into v[0:1], addressed as `v0, s[2:3]` with the constant carried as |
| ; the immediate `offset:-128`. |
| define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v4i16_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v4i16_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v4i16_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <4 x i16>, ptr addrspace(1) %gep1 |
| %cast.load = bitcast <4 x i16> %load to <2 x float> |
| ret <2 x float> %cast.load |
| } |
| |
| ; <4 x half> load from %sbase + zext(%voffset), using a byte-granular GEP; the |
| ; loaded bits are returned as <2 x float>. Every run line expects a single 64-bit |
| ; load (global_load_dwordx2 / global_load_b64) into v[0:1], addressed as |
| ; `v0, s[2:3]` with no immediate offset. |
| define amdgpu_ps <2 x float> @global_load_saddr_v4f16(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v4f16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v4f16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v4f16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <4 x half>, ptr addrspace(1) %gep0 |
| %cast.load = bitcast <4 x half> %load to <2 x float> |
| ret <2 x float> %cast.load |
| } |
| |
| ; <4 x half> load from %sbase + zext(%voffset) followed by a constant -128 byte |
| ; GEP; the loaded bits are returned as <2 x float>. Every run line expects a |
| ; single 64-bit load into v[0:1], addressed as `v0, s[2:3]` with the constant |
| ; carried as the immediate `offset:-128`. |
| define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v4f16_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v4f16_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v4f16_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <4 x half>, ptr addrspace(1) %gep1 |
| %cast.load = bitcast <4 x half> %load to <2 x float> |
| ret <2 x float> %cast.load |
| } |
| |
| ; Loads a `ptr addrspace(1)` value from %sbase + zext(%voffset) (byte-granular |
| ; GEP), converts it to i64 with ptrtoint, and returns the bits as <2 x float>. |
| ; Every run line expects a single 64-bit load (global_load_dwordx2 / |
| ; global_load_b64) into v[0:1], addressed as `v0, s[2:3]` with no immediate |
| ; offset. |
| define amdgpu_ps <2 x float> @global_load_saddr_p1(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_p1: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_p1: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_p1: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load ptr addrspace(1), ptr addrspace(1) %gep0 |
| %cast.load0 = ptrtoint ptr addrspace(1) %load to i64 |
| %cast.load1 = bitcast i64 %cast.load0 to <2 x float> |
| ret <2 x float> %cast.load1 |
| } |
| |
| ; Loads an addrspace(1) pointer from %sbase + zext(%voffset) - 128 and returns |
| ; its bits as <2 x float>. Every prefix expects one 64-bit global load |
| ; (dwordx2 / b64) from s[2:3] + v0 with the -128 as the immediate offset. |
| define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_p1_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_p1_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_p1_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load ptr addrspace(1), ptr addrspace(1) %gep1 |
| %cast.load0 = ptrtoint ptr addrspace(1) %load to i64 |
| %cast.load1 = bitcast i64 %cast.load0 to <2 x float> |
| ret <2 x float> %cast.load1 |
| } |
| |
| ; <3 x float> load from %sbase + zext(%voffset), returned unchanged. |
| ; Every prefix expects one 96-bit global load (dwordx3 / b96) into v[0:2] |
| ; with s[2:3] as base and v0 as offset. |
| define amdgpu_ps <3 x float> @global_load_saddr_v3f32(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v3f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v3f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v3f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <3 x float>, ptr addrspace(1) %gep0 |
| ret <3 x float> %load |
| } |
| |
| ; <3 x float> load from %sbase + zext(%voffset) - 128, returned unchanged. |
| ; Every prefix expects one 96-bit global load (dwordx3 / b96) from s[2:3] + v0 |
| ; with the -128 as the immediate offset. |
| define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v3f32_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v3f32_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v3f32_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <3 x float>, ptr addrspace(1) %gep1 |
| ret <3 x float> %load |
| } |
| |
| ; <3 x i32> load from %sbase + zext(%voffset), bitcast to <3 x float> for the |
| ; return. Every prefix expects one 96-bit global load (dwordx3 / b96) into |
| ; v[0:2] with s[2:3] as base and v0 as offset. |
| define amdgpu_ps <3 x float> @global_load_saddr_v3i32(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v3i32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v3i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v3i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <3 x i32>, ptr addrspace(1) %gep0 |
| %cast.load = bitcast <3 x i32> %load to <3 x float> |
| ret <3 x float> %cast.load |
| } |
| |
| ; <3 x i32> load from %sbase + zext(%voffset) - 128, bitcast to <3 x float> |
| ; for the return. Every prefix expects one 96-bit global load (dwordx3 / b96) |
| ; from s[2:3] + v0 with the -128 as the immediate offset. |
| define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v3i32_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v3i32_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v3i32_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <3 x i32>, ptr addrspace(1) %gep1 |
| %cast.load = bitcast <3 x i32> %load to <3 x float> |
| ret <3 x float> %cast.load |
| } |
| |
| ; <6 x half> load from %sbase + zext(%voffset), returned unchanged. |
| ; Every prefix expects one 96-bit global load (dwordx3 / b96) into v[0:2] |
| ; with s[2:3] as base and v0 as offset. |
| define amdgpu_ps <6 x half> @global_load_saddr_v6f16(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v6f16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v6f16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v6f16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <6 x half>, ptr addrspace(1) %gep0 |
| ret <6 x half> %load |
| } |
| |
| ; <6 x half> load from %sbase + zext(%voffset) - 128, returned unchanged. |
| ; Every prefix expects one 96-bit global load (dwordx3 / b96) from s[2:3] + v0 |
| ; with the -128 as the immediate offset. |
| define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v6f16_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v6f16_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v6f16_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <6 x half>, ptr addrspace(1) %gep1 |
| ret <6 x half> %load |
| } |
| |
| ; <4 x float> load from %sbase + zext(%voffset), returned unchanged. |
| ; Every prefix expects one 128-bit global load (dwordx4 / b128) into v[0:3] |
| ; with s[2:3] as base and v0 as offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_v4f32(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v4f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v4f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v4f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <4 x float>, ptr addrspace(1) %gep0 |
| ret <4 x float> %load |
| } |
| |
| ; <4 x float> load from %sbase + zext(%voffset) - 128, returned unchanged. |
| ; Every prefix expects one 128-bit global load (dwordx4 / b128) from |
| ; s[2:3] + v0 with the -128 as the immediate offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v4f32_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v4f32_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v4f32_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <4 x float>, ptr addrspace(1) %gep1 |
| ret <4 x float> %load |
| } |
| |
| ; <4 x i32> load from %sbase + zext(%voffset), bitcast to <4 x float> for the |
| ; return. Every prefix expects one 128-bit global load (dwordx4 / b128) into |
| ; v[0:3] with s[2:3] as base and v0 as offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_v4i32(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v4i32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v4i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v4i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <4 x i32>, ptr addrspace(1) %gep0 |
| %cast.load = bitcast <4 x i32> %load to <4 x float> |
| ret <4 x float> %cast.load |
| } |
| |
| ; <4 x i32> load from %sbase + zext(%voffset) - 128, bitcast to <4 x float> |
| ; for the return. Every prefix expects one 128-bit global load (dwordx4 / b128) |
| ; from s[2:3] + v0 with the -128 as the immediate offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v4i32_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v4i32_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v4i32_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <4 x i32>, ptr addrspace(1) %gep1 |
| %cast.load = bitcast <4 x i32> %load to <4 x float> |
| ret <4 x float> %cast.load |
| } |
| |
| ; <2 x i64> load from %sbase + zext(%voffset), bitcast to <4 x float> for the |
| ; return. Every prefix expects one 128-bit global load (dwordx4 / b128) into |
| ; v[0:3] with s[2:3] as base and v0 as offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_v2i64(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2i64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2i64: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2i64: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <2 x i64>, ptr addrspace(1) %gep0 |
| %cast.load = bitcast <2 x i64> %load to <4 x float> |
| ret <4 x float> %cast.load |
| } |
| |
| ; <2 x i64> load from %sbase + zext(%voffset) - 128, bitcast to <4 x float> |
| ; for the return. Every prefix expects one 128-bit global load (dwordx4 / b128) |
| ; from s[2:3] + v0 with the -128 as the immediate offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2i64_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2i64_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2i64_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <2 x i64>, ptr addrspace(1) %gep1 |
| %cast.load = bitcast <2 x i64> %load to <4 x float> |
| ret <4 x float> %cast.load |
| } |
| |
| ; Scalar i128 load from %sbase + zext(%voffset), bitcast to <4 x float> for |
| ; the return. Every prefix expects one 128-bit global load (dwordx4 / b128) |
| ; into v[0:3] with s[2:3] as base and v0 as offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_i128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i128, ptr addrspace(1) %gep0 |
| %cast.load = bitcast i128 %load to <4 x float> |
| ret <4 x float> %cast.load |
| } |
| |
| ; Scalar i128 load from %sbase + zext(%voffset) - 128, bitcast to <4 x float> |
| ; for the return. Every prefix expects one 128-bit global load (dwordx4 / b128) |
| ; from s[2:3] + v0 with the -128 as the immediate offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_i128_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i128_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i128_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load i128, ptr addrspace(1) %gep1 |
| %cast.load = bitcast i128 %load to <4 x float> |
| ret <4 x float> %cast.load |
| } |
| |
| ; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset) and |
| ; returns their bits as <4 x float> (ptrtoint to <2 x i64>, then bitcast). |
| ; Every prefix expects one 128-bit global load (dwordx4 / b128) into v[0:3] |
| ; with s[2:3] as base and v0 as offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_v2p1(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2p1: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2p1: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2p1: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <2 x ptr addrspace(1)>, ptr addrspace(1) %gep0 |
| %cast.load0 = ptrtoint <2 x ptr addrspace(1)> %load to <2 x i64> |
| %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float> |
| ret <4 x float> %cast.load1 |
| } |
| |
| ; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset) - 128 |
| ; and returns their bits as <4 x float> (ptrtoint to <2 x i64>, then bitcast). |
| ; Every prefix expects one 128-bit global load (dwordx4 / b128) from |
| ; s[2:3] + v0 with the -128 as the immediate offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v2p1_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v2p1_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v2p1_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <2 x ptr addrspace(1)>, ptr addrspace(1) %gep1 |
| %cast.load0 = ptrtoint <2 x ptr addrspace(1)> %load to <2 x i64> |
| %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float> |
| ret <4 x float> %cast.load1 |
| } |
| |
| ; Loads a vector of four addrspace(3) pointers from %sbase + zext(%voffset) and |
| ; returns their bits as <4 x float> (ptrtoint to <4 x i32>, then bitcast). |
| ; Every prefix expects one 128-bit global load (dwordx4 / b128) into v[0:3] |
| ; with s[2:3] as base and v0 as offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_v4p3(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v4p3: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v4p3: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v4p3: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load <4 x ptr addrspace(3)>, ptr addrspace(1) %gep0 |
| %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32> |
| %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> |
| ret <4 x float> %cast.load1 |
| } |
| |
| ; Loads a vector of four addrspace(3) pointers from %sbase + zext(%voffset) - 128 |
| ; and returns their bits as <4 x float> (ptrtoint to <4 x i32>, then bitcast). |
| ; Every prefix expects one 128-bit global load (dwordx4 / b128) from |
| ; s[2:3] + v0 with the -128 as the immediate offset. |
| define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_load_saddr_v4p3_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_v4p3_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_v4p3_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load <4 x ptr addrspace(3)>, ptr addrspace(1) %gep1 |
| %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32> |
| %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> |
| ret <4 x float> %cast.load1 |
| } |
| |
| ; -------------------------------------------------------------------------------- |
| ; Extending loads |
| ; -------------------------------------------------------------------------------- |
| |
| ; i8 load from %sbase + zext(%voffset), sign-extended to i32 and returned as |
| ; float bits. Every prefix expects only a signed byte load (global_load_sbyte / |
| ; global_load_i8) from s[2:3] + v0, with no separate extend instruction. |
| define amdgpu_ps float @global_sextload_saddr_i8(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_sextload_saddr_i8: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_sextload_saddr_i8: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_i8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_sextload_saddr_i8: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_i8 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i8, ptr addrspace(1) %gep0 |
| %sextload = sext i8 %load to i32 |
| %cast.load = bitcast i32 %sextload to float |
| ret float %cast.load |
| } |
| |
| ; i8 load from %sbase + zext(%voffset) - 128, sign-extended to i32 and returned |
| ; as float bits. Every prefix expects only a signed byte load |
| ; (global_load_sbyte / global_load_i8) from s[2:3] + v0 with the -128 as the |
| ; immediate offset. |
| define amdgpu_ps float @global_sextload_saddr_i8_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_sextload_saddr_i8_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_sextload_saddr_i8_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_sextload_saddr_i8_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load i8, ptr addrspace(1) %gep1 |
| %sextload = sext i8 %load to i32 |
| %cast.load = bitcast i32 %sextload to float |
| ret float %cast.load |
| } |
| |
| ; i16 load from %sbase + zext(%voffset), sign-extended to i32 and returned as |
| ; float bits. Every prefix expects only a signed 16-bit load |
| ; (global_load_sshort / global_load_i16) from s[2:3] + v0, with no separate |
| ; extend instruction. |
| define amdgpu_ps float @global_sextload_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_sextload_saddr_i16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_sshort v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_sextload_saddr_i16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_i16 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_sextload_saddr_i16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_i16 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i16, ptr addrspace(1) %gep0 |
| %sextload = sext i16 %load to i32 |
| %cast.load = bitcast i32 %sextload to float |
| ret float %cast.load |
| } |
| |
| ; i16 load from %sbase + zext(%voffset) - 128, sign-extended to i32 and |
| ; returned as float bits. Every prefix expects only a signed 16-bit load |
| ; (global_load_sshort / global_load_i16) from s[2:3] + v0 with the -128 as the |
| ; immediate offset. |
| define amdgpu_ps float @global_sextload_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_sextload_saddr_i16_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_sshort v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_sextload_saddr_i16_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_i16 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_sextload_saddr_i16_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_i16 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load i16, ptr addrspace(1) %gep1 |
| %sextload = sext i16 %load to i32 |
| %cast.load = bitcast i32 %sextload to float |
| ret float %cast.load |
| } |
| |
| ; i8 load from %sbase + zext(%voffset), zero-extended to i32 and returned as |
| ; float bits. Every prefix expects only an unsigned byte load |
| ; (global_load_ubyte / global_load_u8) from s[2:3] + v0, with no separate |
| ; extend instruction. |
| define amdgpu_ps float @global_zextload_saddr_i8(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_zextload_saddr_i8: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_zextload_saddr_i8: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_zextload_saddr_i8: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i8, ptr addrspace(1) %gep0 |
| %zextload = zext i8 %load to i32 |
| %cast.load = bitcast i32 %zextload to float |
| ret float %cast.load |
| } |
| |
| ; i8 load from %sbase + zext(%voffset) - 128, zero-extended to i32 and returned |
| ; as float bits. Every prefix expects only an unsigned byte load |
| ; (global_load_ubyte / global_load_u8) from s[2:3] + v0 with the -128 as the |
| ; immediate offset. |
| define amdgpu_ps float @global_zextload_saddr_i8_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_zextload_saddr_i8_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_zextload_saddr_i8_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_zextload_saddr_i8_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load i8, ptr addrspace(1) %gep1 |
| %zextload = zext i8 %load to i32 |
| %cast.load = bitcast i32 %zextload to float |
| ret float %cast.load |
| } |
| |
| ; i16 load from %sbase + zext(%voffset), zero-extended to i32 and returned as |
| ; float bits. Every prefix expects only an unsigned 16-bit load |
| ; (global_load_ushort / global_load_u16) from s[2:3] + v0, with no separate |
| ; extend instruction. |
| define amdgpu_ps float @global_zextload_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_zextload_saddr_i16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_zextload_saddr_i16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_zextload_saddr_i16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i16, ptr addrspace(1) %gep0 |
| %zextload = zext i16 %load to i32 |
| %cast.load = bitcast i32 %zextload to float |
| ret float %cast.load |
| } |
| |
| ; i16 load from %sbase + zext(%voffset) - 128, zero-extended to i32 and |
| ; returned as float bits. Every prefix expects only an unsigned 16-bit load |
| ; (global_load_ushort / global_load_u16) from s[2:3] + v0 with the -128 as the |
| ; immediate offset. |
| define amdgpu_ps float @global_zextload_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GCN-LABEL: global_zextload_saddr_i16_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_zextload_saddr_i16_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_zextload_saddr_i16_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load i16, ptr addrspace(1) %gep1 |
| %zextload = zext i16 %load to i32 |
| %cast.load = bitcast i32 %zextload to float |
| ret float %cast.load |
| } |
| |
| ; -------------------------------------------------------------------------------- |
| ; Atomic load |
| ; -------------------------------------------------------------------------------- |
| |
| ; seq_cst atomic i32 load (align 4) from %sbase + zext(%voffset), returned as |
| ; float bits. The checks still expect the s[2:3] base / v0 offset form of the |
| ; load. Each target adds its own modifier on the load (glc, glc dlc, or |
| ; scope:SCOPE_SYS) and its own invalidate instruction(s) after the wait, so |
| ; GFX9 and GFX10 are checked under separate prefixes here. |
| define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: atomic_global_load_saddr_i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: atomic_global_load_saddr_i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: atomic_global_load_saddr_i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: atomic_global_load_saddr_i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_inv scope:SCOPE_SYS |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load atomic i32, ptr addrspace(1) %gep0 seq_cst, align 4 |
| %cast.load = bitcast i32 %load to float |
| ret float %cast.load |
| } |
| |
| ; seq_cst atomic i32 load (align 4) from %sbase + zext(%voffset) - 128, |
| ; returned as float bits. The checks still expect the s[2:3] base / v0 offset |
| ; form with the -128 as the immediate offset. Each target adds its own |
| ; modifier on the load (glc, glc dlc, or scope:SCOPE_SYS) and its own |
| ; invalidate instruction(s) after the wait, so GFX9 and GFX10 are checked |
| ; under separate prefixes here. |
| define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) { |
| ; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_wbinvl1 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_gl1_inv |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: atomic_global_load_saddr_i32_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 glc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: buffer_gl1_inv |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: atomic_global_load_saddr_i32_immneg128: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_inv scope:SCOPE_SYS |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load atomic i32, ptr addrspace(1) %gep1 seq_cst, align 4 |
| %cast.load = bitcast i32 %load to float |
| ret float %cast.load |
| } |
| |
| ; Sequentially consistent atomic i64 load from %sbase + zext(%voffset), with
| ; no constant displacement. All four check groups expect a single 64-bit
| ; global load into v[0:1], followed by a wait and the target's
| ; buffer_wbinvl1 / *_inv instruction(s).
| define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
| ; GFX9-LABEL: atomic_global_load_saddr_i64:
| ; GFX9: ; %bb.0:
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: buffer_wbinvl1
| ; GFX9-NEXT: ; return to shader part epilog
| ;
| ; GFX10-LABEL: atomic_global_load_saddr_i64:
| ; GFX10: ; %bb.0:
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: buffer_gl1_inv
| ; GFX10-NEXT: buffer_gl0_inv
| ; GFX10-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: atomic_global_load_saddr_i64:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] glc
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: buffer_gl1_inv
| ; GFX11-NEXT: buffer_gl0_inv
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-LABEL: atomic_global_load_saddr_i64:
| ; GFX12: ; %bb.0:
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] scope:SCOPE_SYS
| ; GFX12-NEXT: s_wait_loadcnt 0x0
| ; GFX12-NEXT: global_inv scope:SCOPE_SYS
| ; GFX12-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %load = load atomic i64, ptr addrspace(1) %gep0 seq_cst, align 8
| ; Reinterpret the 64 loaded bits as <2 x float> to match the return type.
| %cast.load = bitcast i64 %load to <2 x float>
| ret <2 x float> %cast.load
| }
| |
| ; Sequentially consistent atomic i64 load from %sbase + zext(%voffset) - 128.
| ; All four check groups expect a single 64-bit global load into v[0:1] that
| ; carries the -128 in its immediate offset field, followed by a wait and the
| ; target's buffer_wbinvl1 / *_inv instruction(s).
| define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
| ; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128:
| ; GFX9: ; %bb.0:
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc
| ; GFX9-NEXT: s_waitcnt vmcnt(0)
| ; GFX9-NEXT: buffer_wbinvl1
| ; GFX9-NEXT: ; return to shader part epilog
| ;
| ; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128:
| ; GFX10: ; %bb.0:
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc
| ; GFX10-NEXT: s_waitcnt vmcnt(0)
| ; GFX10-NEXT: buffer_gl1_inv
| ; GFX10-NEXT: buffer_gl0_inv
| ; GFX10-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: atomic_global_load_saddr_i64_immneg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 glc
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: buffer_gl1_inv
| ; GFX11-NEXT: buffer_gl0_inv
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-LABEL: atomic_global_load_saddr_i64_immneg128:
| ; GFX12: ; %bb.0:
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 scope:SCOPE_SYS
| ; GFX12-NEXT: s_wait_loadcnt 0x0
| ; GFX12-NEXT: global_inv scope:SCOPE_SYS
| ; GFX12-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| ; Constant -128 byte displacement applied on top of the variable offset.
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %load = load atomic i64, ptr addrspace(1) %gep1 seq_cst, align 8
| ; Reinterpret the 64 loaded bits as <2 x float> to match the return type.
| %cast.load = bitcast i64 %load to <2 x float>
| ret <2 x float> %cast.load
| }
| |
| ; -------------------------------------------------------------------------------- |
| ; D16 load (low 16) |
| ; -------------------------------------------------------------------------------- |
| |
| ; Loads an i16 from %sbase + zext(%voffset) and places it in element 0 of an
| ; otherwise-undef <2 x i16>. Every check group except the GFX12 GlobalISel
| ; one expects a d16 load written straight into v0; the GFX12 GlobalISel
| ; checks expect a plain global_load_u16 instead.
| ; NOTE(review): LLVM has been moving tests from `undef` to `poison`; confirm
| ; whether this input should be migrated (checks would need regenerating).
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
| ; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3]
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3]
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_undef_hi:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_b16 v0, v0, s[2:3]
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %load = load i16, ptr addrspace(1) %gep0
| ; Index 0 receives the loaded value; element 1 stays undef.
| %build = insertelement <2 x i16> undef, i16 %load, i32 0
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i16 from %sbase + zext(%voffset) - 128 and places it in element 0
| ; of an otherwise-undef <2 x i16>. Every check group except the GFX12
| ; GlobalISel one expects a d16 load into v0 with offset:-128; the GFX12
| ; GlobalISel checks expect a plain global_load_u16 with the same offset.
| ; NOTE(review): LLVM has been moving tests from `undef` to `poison`; confirm
| ; whether this input should be migrated (checks would need regenerating).
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
| ; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] offset:-128
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| ; Constant -128 byte displacement applied on top of the variable offset.
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %load = load i16, ptr addrspace(1) %gep1
| ; Index 0 receives the loaded value; element 1 stays undef.
| %build = insertelement <2 x i16> undef, i16 %load, i32 0
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i16 from %sbase + zext(%voffset) and places it in element 0 of a
| ; zeroinitializer <2 x i16>, so element 1 of the result must be zero.
| ; Every check group except the GFX12 GlobalISel one expects v1 to be zeroed
| ; first, a d16 load into v1, then a copy to v0. The GFX12 GlobalISel checks
| ; expect a plain global_load_u16 followed by a v_and_b32 with 0xffff.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
| ; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: v_mov_b32_e32 v1, 0
| ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3]
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0
| ; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3]
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zero_hi:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
| ; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3]
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %load = load i16, ptr addrspace(1) %gep0
| ; Index 0 receives the loaded value; element 1 keeps the constant zero.
| %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i16 from %sbase + zext(%voffset) - 128 and places it in element 0
| ; of a zeroinitializer <2 x i16>, so element 1 of the result must be zero.
| ; Every check group except the GFX12 GlobalISel one expects v1 to be zeroed
| ; first, a d16 load into v1 with offset:-128, then a copy to v0. The GFX12
| ; GlobalISel checks expect a plain global_load_u16 with offset:-128 followed
| ; by a v_and_b32 with 0xffff.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
| ; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: v_mov_b32_e32 v1, 0
| ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0
| ; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
| ; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| ; Constant -128 byte displacement applied on top of the variable offset.
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %load = load i16, ptr addrspace(1) %gep1
| ; Index 0 receives the loaded value; element 1 keeps the constant zero.
| %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i16 from %sbase + zext(%voffset) and overwrites element 0 of the
| ; incoming %reg vector, keeping %reg's element 1.
| ; Every check group except the GFX12 GlobalISel one expects a d16 load
| ; directly into v1 (presumably the register holding %reg) and a copy to v0.
| ; The GFX12 GlobalISel checks expect a plain global_load_u16 merged with v1
| ; through v_and_or_b32 using the 0xffff0000 mask.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
| ; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3]
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3]
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3]
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %load = load i16, ptr addrspace(1) %gep0
| ; Index 0 receives the loaded value; element 1 comes from %reg unchanged.
| %build = insertelement <2 x i16> %reg, i16 %load, i32 0
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i16 from %sbase + zext(%voffset) - 128 and overwrites element 0 of
| ; the incoming %reg vector, keeping %reg's element 1.
| ; Every check group except the GFX12 GlobalISel one expects a d16 load with
| ; offset:-128 directly into v1 (presumably the register holding %reg) and a
| ; copy to v0. The GFX12 GlobalISel checks expect a plain global_load_u16 with
| ; offset:-128 merged with v1 through v_and_or_b32 using the 0xffff0000 mask.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
| ; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| ; Constant -128 byte displacement applied on top of the variable offset.
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %load = load i16, ptr addrspace(1) %gep1
| ; Index 0 receives the loaded value; element 1 comes from %reg unchanged.
| %build = insertelement <2 x i16> %reg, i16 %load, i32 0
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i8 from %sbase + zext(%voffset), zero-extends it to i16, and
| ; overwrites element 0 of the incoming %reg vector.
| ; Every check group except the GFX12 GlobalISel one expects an unsigned-byte
| ; d16 load (global_load_ubyte_d16 / global_load_d16_u8) into v1 and a copy to
| ; v0. The GFX12 GlobalISel checks expect a plain global_load_u8 merged with
| ; v1 through v_and_or_b32 using the 0xffff0000 mask.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
| ; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3]
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_u8 v1, v0, s[2:3]
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_u8 v1, v0, s[2:3]
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3]
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %load = load i8, ptr addrspace(1) %gep0
| ; Widen the byte to 16 bits with zero fill before inserting it.
| %zext.load = zext i8 %load to i16
| ; Index 0 receives the widened value; element 1 comes from %reg unchanged.
| %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i8 from %sbase + zext(%voffset) - 128, zero-extends it to i16, and
| ; overwrites element 0 of the incoming %reg vector.
| ; Every check group except the GFX12 GlobalISel one expects an unsigned-byte
| ; d16 load with offset:-128 into v1 and a copy to v0. The GFX12 GlobalISel
| ; checks expect a plain global_load_u8 with offset:-128 merged with v1
| ; through v_and_or_b32 using the 0xffff0000 mask.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
| ; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3] offset:-128
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| ; Constant -128 byte displacement applied on top of the variable offset.
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %load = load i8, ptr addrspace(1) %gep1
| ; Widen the byte to 16 bits with zero fill before inserting it.
| %zext.load = zext i8 %load to i16
| ; Index 0 receives the widened value; element 1 comes from %reg unchanged.
| %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i8 from %sbase + zext(%voffset), sign-extends it to i16, and
| ; overwrites element 0 of the incoming %reg vector.
| ; Every check group except the GFX12 GlobalISel one expects a signed-byte
| ; d16 load (global_load_sbyte_d16 / global_load_d16_i8) into v1 and a copy to
| ; v0. The GFX12 GlobalISel checks expect a plain global_load_i8, a v_and_b32
| ; with 0xffff, and then a v_and_or_b32 merge with v1 using 0xffff0000.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
| ; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3]
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_i8 v1, v0, s[2:3]
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3]
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3]
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %load = load i8, ptr addrspace(1) %gep0
| ; Widen the byte to 16 bits, replicating its sign bit.
| %sext.load = sext i8 %load to i16
| ; Index 0 receives the widened value; element 1 comes from %reg unchanged.
| %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i8 from %sbase + zext(%voffset) - 128, sign-extends it to i16, and
| ; overwrites element 0 of the incoming %reg vector.
| ; Every check group except the GFX12 GlobalISel one expects a signed-byte
| ; d16 load with offset:-128 into v1 and a copy to v0. The GFX12 GlobalISel
| ; checks expect a plain global_load_i8 with offset:-128, a v_and_b32 with
| ; 0xffff, and then a v_and_or_b32 merge with v1 using 0xffff0000.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
| ; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3] offset:-128
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| ; Constant -128 byte displacement applied on top of the variable offset.
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %load = load i8, ptr addrspace(1) %gep1
| ; Widen the byte to 16 bits, replicating its sign bit.
| %sext.load = sext i8 %load to i16
| ; Index 0 receives the widened value; element 1 comes from %reg unchanged.
| %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; -------------------------------------------------------------------------------- |
| ; D16 hi load (hi16) |
| ; -------------------------------------------------------------------------------- |
| |
| ; Loads an i16 from %sbase + zext(%voffset) and places it in element 1 of an
| ; otherwise-undef <2 x i16>. Despite the trailing `_hi` in the name, the
| ; undef element here is index 0; the loaded value is what lands at index 1.
| ; Every check group except the GFX12 GlobalISel one expects a d16_hi load
| ; straight into v0; the GFX12 GlobalISel checks expect a plain
| ; global_load_u16 followed by a left shift by 16.
| ; NOTE(review): LLVM has been moving tests from `undef` to `poison`; confirm
| ; whether this input should be migrated (checks would need regenerating).
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
| ; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3]
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3]
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_undef_hi:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3]
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %load = load i16, ptr addrspace(1) %gep0
| ; Index 1 receives the loaded value; element 0 stays undef.
| %build = insertelement <2 x i16> undef, i16 %load, i32 1
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i16 from %sbase + zext(%voffset) - 128 and places it in element 1
| ; of an otherwise-undef <2 x i16>. Despite the `_undef_hi` in the name, the
| ; undef element here is index 0; the loaded value is what lands at index 1.
| ; Every check group except the GFX12 GlobalISel one expects a d16_hi load
| ; with offset:-128 straight into v0; the GFX12 GlobalISel checks expect a
| ; plain global_load_u16 with offset:-128 followed by a left shift by 16.
| ; NOTE(review): LLVM has been moving tests from `undef` to `poison`; confirm
| ; whether this input should be migrated (checks would need regenerating).
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
| ; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] offset:-128
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| ; Constant -128 byte displacement applied on top of the variable offset.
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %load = load i16, ptr addrspace(1) %gep1
| ; Index 1 receives the loaded value; element 0 stays undef.
| %build = insertelement <2 x i16> undef, i16 %load, i32 1
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i16 from %sbase + zext(%voffset) and places it in element 1 of a
| ; zeroinitializer <2 x i16>, so element 0 of the result must be zero.
| ; Despite the trailing `_hi` in the name, the zero element here is index 0.
| ; Every check group except the GFX12 GlobalISel one expects v1 to be zeroed
| ; first, a d16_hi load into v1, then a copy to v0. The GFX12 GlobalISel
| ; checks expect a plain global_load_u16 followed by a left shift by 16.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
| ; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: v_mov_b32_e32 v1, 0
| ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3]
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0
| ; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zero_hi:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
| ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %load = load i16, ptr addrspace(1) %gep0
| ; Index 1 receives the loaded value; element 0 keeps the constant zero.
| %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i16 from %sbase + zext(%voffset) - 128 and places it in element 1
| ; of a zeroinitializer <2 x i16>, so element 0 of the result must be zero.
| ; Despite the `_zero_hi` in the name, the zero element here is index 0.
| ; Every check group except the GFX12 GlobalISel one expects v1 to be zeroed
| ; first, a d16_hi load with offset:-128 into v1, then a copy to v0. The GFX12
| ; GlobalISel checks expect a plain global_load_u16 with offset:-128 followed
| ; by a left shift by 16.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
| ; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: v_mov_b32_e32 v1, 0
| ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0
| ; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
| ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| ; Constant -128 byte displacement applied on top of the variable offset.
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %load = load i16, ptr addrspace(1) %gep1
| ; Index 1 receives the loaded value; element 0 keeps the constant zero.
| %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i16 from %sbase + zext(%voffset) and overwrites element 1 of the
| ; incoming %reg vector, keeping %reg's element 0. Despite the trailing `_hi`
| ; in the name, the element preserved from %reg here is index 0.
| ; Every check group except the GFX12 GlobalISel one expects a d16_hi load
| ; directly into v1 (presumably the register holding %reg) and a copy to v0.
| ; The GFX12 GlobalISel checks expect a plain global_load_u16, a left shift by
| ; 16, and then a v_and_or_b32 merge with v1 using the 0xffff mask.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
| ; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3]
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %load = load i16, ptr addrspace(1) %gep0
| ; Index 1 receives the loaded value; element 0 comes from %reg unchanged.
| %build = insertelement <2 x i16> %reg, i16 %load, i32 1
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i16 from %sbase + zext(%voffset) - 128 and overwrites element 1 of
| ; the incoming %reg vector, keeping %reg's element 0. Despite the `_reg_hi`
| ; in the name, the element preserved from %reg here is index 0.
| ; Every check group except the GFX12 GlobalISel one expects a d16_hi load
| ; with offset:-128 directly into v1 (presumably the register holding %reg)
| ; and a copy to v0. The GFX12 GlobalISel checks expect a plain
| ; global_load_u16 with offset:-128, a left shift by 16, and then a
| ; v_and_or_b32 merge with v1 using the 0xffff mask.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
| ; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| ; Constant -128 byte displacement applied on top of the variable offset.
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
| %load = load i16, ptr addrspace(1) %gep1
| ; Index 1 receives the loaded value; element 0 comes from %reg unchanged.
| %build = insertelement <2 x i16> %reg, i16 %load, i32 1
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Loads an i8 from %sbase + zext(%voffset), zero-extends it to i16, and
| ; overwrites element 1 of the incoming %reg vector. Despite the trailing
| ; `_hi` in the name, the element preserved from %reg here is index 0.
| ; Every check group except the GFX12 GlobalISel one expects an unsigned-byte
| ; d16_hi load (global_load_ubyte_d16_hi / global_load_d16_hi_u8) into v1 and
| ; a copy to v0. The GFX12 GlobalISel checks expect a plain global_load_u8, a
| ; left shift by 16, and then a v_and_or_b32 merge with v1 using 0xffff.
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
| ; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
| ; GCN: ; %bb.0:
| ; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3]
| ; GCN-NEXT: s_waitcnt vmcnt(0)
| ; GCN-NEXT: v_mov_b32_e32 v0, v1
| ; GCN-NEXT: ; return to shader part epilog
| ;
| ; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
| ; GFX11: ; %bb.0:
| ; GFX11-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3]
| ; GFX11-NEXT: s_waitcnt vmcnt(0)
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1
| ; GFX11-NEXT: ; return to shader part epilog
| ;
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
| ; GFX12-SDAG: ; %bb.0:
| ; GFX12-SDAG-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3]
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
| ; GFX12-SDAG-NEXT: ; return to shader part epilog
| ;
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
| ; GFX12-GISEL: ; %bb.0:
| ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3]
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
| ; GFX12-GISEL-NEXT: ; return to shader part epilog
| ; Address = uniform base + zero-extended 32-bit variable offset.
| %zext.offset = zext i32 %voffset to i64
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
| %load = load i8, ptr addrspace(1) %gep0
| ; Widen the byte to 16 bits with zero fill before inserting it.
| %zext.load = zext i8 %load to i16
| ; Index 1 receives the widened value; element 0 comes from %reg unchanged.
| %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
| %cast = bitcast <2 x i16> %build to <2 x half>
| ret <2 x half> %cast
| }
| |
| ; Same as the zero-extended i8 high-half insert, but with an extra constant |
| ; -128 byte GEP on top of %sbase + zext(%voffset). Every run expects the |
| ; constant to appear as an `offset:-128` immediate on the load instruction. |
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) { |
| ; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mov_b32_e32 v0, v1 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; Address = %sbase + zext(%voffset) - 128. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load i8, ptr addrspace(1) %gep1 |
| ; Widen the byte to i16 with zero fill, then place it in the high element. |
| %zext.load = zext i8 %load to i16 |
| %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 |
| %cast = bitcast <2 x i16> %build to <2 x half> |
| ret <2 x half> %cast |
| } |
| |
| ; Sign-extended i8 load written into element 1 (the high 16 bits) of %reg. |
| ; The address is the inreg %sbase plus the zero-extended 32-bit %voffset, with |
| ; no immediate offset. The SelectionDAG runs expect a single d16-hi signed byte |
| ; load in saddr form; the gfx1200 GlobalISel run expects a signed byte load |
| ; followed by a 16-bit mask, a shift, and a v_and_or_b32 merge with %reg. |
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) { |
| ; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mov_b32_e32 v0, v1 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; Address = %sbase + zext(%voffset). |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %load = load i8, ptr addrspace(1) %gep0 |
| ; Widen the byte to i16 with sign fill, then place it in the high element. |
| %sext.load = sext i8 %load to i16 |
| %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 |
| %cast = bitcast <2 x i16> %build to <2 x half> |
| ret <2 x half> %cast |
| } |
| |
| ; Same as the sign-extended i8 high-half insert, but with an extra constant |
| ; -128 byte GEP on top of %sbase + zext(%voffset). Every run expects the |
| ; constant to appear as an `offset:-128` immediate on the load instruction. |
| define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) { |
| ; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mov_b32_e32 v0, v1 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: |
| ; GFX12-SDAG: ; %bb.0: |
| ; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX12-SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: |
| ; GFX12-GISEL: ; %bb.0: |
| ; GFX12-GISEL-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128 |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 |
| ; GFX12-GISEL-NEXT: ; return to shader part epilog |
| ; Address = %sbase + zext(%voffset) - 128. |
| %zext.offset = zext i32 %voffset to i64 |
| %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset |
| %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 |
| %load = load i8, ptr addrspace(1) %gep1 |
| ; Widen the byte to i16 with sign fill, then place it in the high element. |
| %sext.load = sext i8 %load to i16 |
| %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 |
| %cast = bitcast <2 x i16> %build to <2 x half> |
| ret <2 x half> %cast |
| } |
| |
| ; -------------------------------------------------------------------------------- |
| ; or-with-constant as add |
| ; -------------------------------------------------------------------------------- |
| |
| ; Check add-as-or with split 64-bit or. |
| ; The address is inttoptr(zext(%idx) | 16); %sbase is not used by the body. |
| ; All runs expect the `or` to be applied to the low 32 bits only, the high half |
| ; to be a zero move, and the load to use a VGPR pair address with `off` (no |
| ; saddr form and no immediate offset). |
| define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrspace(6) inreg %sbase, i32 %idx) { |
| ; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_or_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_mov_b32_e32 v1, 0 |
| ; GCN-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_or_b32_e32 v0, 16, v0 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_or_b32_e32 v0, 16, v0 |
| ; GFX12-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.idx = zext i32 %idx to i64 |
| %or = or i64 %zext.idx, 16 |
| ; The pointer comes straight from the integer; no base pointer is involved. |
| %addr = inttoptr i64 %or to ptr addrspace(1) |
| %load = load i8, ptr addrspace(1) %addr |
| ; Zero-extend and bitcast so the byte is returned through a float VGPR. |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; Variant of the or-as-add test with the constant 4160 (0x1040). |
| ; The address is inttoptr(zext(%idx) | 4160); %sbase is not used by the body. |
| ; All runs expect the constant to stay a literal operand of the 32-bit `or` |
| ; and the load to use a VGPR pair address with `off`. |
| define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) { |
| ; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_or_b32_e32 v0, 0x1040, v0 |
| ; GCN-NEXT: v_mov_b32_e32 v1, 0 |
| ; GCN-NEXT: global_load_ubyte v0, v[0:1], off |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_or_b32_e32 v0, 0x1040, v0 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_or_b32_e32 v0, 0x1040, v0 |
| ; GFX12-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-NEXT: global_load_u8 v0, v[0:1], off |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: ; return to shader part epilog |
| %zext.idx = zext i32 %idx to i64 |
| %or = or i64 %zext.idx, 4160 |
| ; The pointer comes straight from the integer; no base pointer is involved. |
| %addr = inttoptr i64 %or to ptr addrspace(1) |
| %load = load i8, ptr addrspace(1) %addr |
| ; Zero-extend and bitcast so the byte is returned through a float VGPR. |
| %zext = zext i8 %load to i32 |
| %to.vgpr = bitcast i32 %zext to float |
| ret float %to.vgpr |
| } |
| |
| ; -------------------------------------------------------------------------------- |
| ; Full 64-bit scalar add. |
| ; -------------------------------------------------------------------------------- |
| |
| ; Loop that performs a volatile float load from %arg[i] for i = 0..255; the |
| ; loaded value is not used. The gfx900/gfx1010/gfx1100 runs and the gfx1200 |
| ; SelectionDAG run expect the address to be formed by a 64-bit scalar add of |
| ; the base and a running byte offset, with the load in saddr form using a |
| ; zero VGPR offset that is initialized once before the loop. The gfx1200 |
| ; GlobalISel run expects the add to be done in VGPRs and the load to use `off`. |
| define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { |
| ; GFX9-LABEL: global_addr_64bit_lsr_iv: |
| ; GFX9: ; %bb.0: ; %bb |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: .LBB132_1: ; %bb3 |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_add_u32 s4, s2, s0 |
| ; GFX9-NEXT: s_addc_u32 s5, s3, s1 |
| ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_add_u32 s0, s0, 4 |
| ; GFX9-NEXT: s_addc_u32 s1, s1, 0 |
| ; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400 |
| ; GFX9-NEXT: s_cbranch_scc0 .LBB132_1 |
| ; GFX9-NEXT: ; %bb.2: ; %bb2 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_addr_64bit_lsr_iv: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB132_1: ; %bb3 |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_add_u32 s4, s2, s0 |
| ; GFX10-NEXT: s_addc_u32 s5, s3, s1 |
| ; GFX10-NEXT: s_add_u32 s0, s0, 4 |
| ; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_addc_u32 s1, s1, 0 |
| ; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 |
| ; GFX10-NEXT: s_cbranch_scc0 .LBB132_1 |
| ; GFX10-NEXT: ; %bb.2: ; %bb2 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_addr_64bit_lsr_iv: |
| ; GFX11: ; %bb.0: ; %bb |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: .LBB132_1: ; %bb3 |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_add_u32 s4, s2, s0 |
| ; GFX11-NEXT: s_addc_u32 s5, s3, s1 |
| ; GFX11-NEXT: s_add_u32 s0, s0, 4 |
| ; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_addc_u32 s1, s1, 0 |
| ; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400 |
| ; GFX11-NEXT: s_cbranch_scc0 .LBB132_1 |
| ; GFX11-NEXT: ; %bb.2: ; %bb2 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv: |
| ; GFX12-SDAG: ; %bb.0: ; %bb |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX12-SDAG-NEXT: .LBB132_1: ; %bb3 |
| ; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 |
| ; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 |
| ; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB132_1 |
| ; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv: |
| ; GFX12-GISEL: ; %bb.0: ; %bb |
| ; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 |
| ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 |
| ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 |
| ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2 |
| ; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB132_1 |
| ; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| bb: |
| br label %bb3 |
| |
| bb2: ; preds = %bb3 |
| ret void |
| |
| bb3: ; preds = %bb3, %bb |
| ; %i is a 32-bit element index; it is zero-extended before indexing, so the |
| ; GEP offset is a full 64-bit value. |
| %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ] |
| %i4 = zext i32 %i to i64 |
| %i5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %i4 |
| %i6 = load volatile float, ptr addrspace(1) %i5, align 4 |
| %i8 = add nuw nsw i32 %i, 1 |
| ; Exit after 256 elements; the expected code compares the byte offset against |
| ; 0x400 (256 * 4) instead. |
| %i9 = icmp eq i32 %i8, 256 |
| br i1 %i9, label %bb2, label %bb3 |
| } |
| |
| ; Make sure we only have a single zero vaddr initialization. |
| ; Same loop as global_addr_64bit_lsr_iv, but with two volatile loads per |
| ; iteration. In the saddr-form runs both loads are expected to share the one |
| ; zero VGPR offset (v0) that is set up before the loop. |
| ; NOTE(review): %i5.1 (derived from %arg.1) is never used; the second load |
| ; reads %i5 again, so both loads target the same address and the expected code |
| ; uses the same address registers for both. It looks like the second load may |
| ; have been meant to use %i5.1 - confirm. Changing it alters codegen, so the |
| ; assertions would have to be regenerated with utils/update_llc_test_checks.py. |
| |
| define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg %arg, ptr addrspace(1) inreg %arg.1) { |
| ; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload: |
| ; GFX9: ; %bb.0: ; %bb |
| ; GFX9-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: .LBB133_1: ; %bb3 |
| ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX9-NEXT: s_add_u32 s4, s2, s0 |
| ; GFX9-NEXT: s_addc_u32 s5, s3, s1 |
| ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_add_u32 s0, s0, 4 |
| ; GFX9-NEXT: s_addc_u32 s1, s1, 0 |
| ; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400 |
| ; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5 |
| ; GFX9-NEXT: s_cbranch_scc0 .LBB133_1 |
| ; GFX9-NEXT: ; %bb.2: ; %bb2 |
| ; GFX9-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-NEXT: .LBB133_1: ; %bb3 |
| ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_add_u32 s4, s2, s0 |
| ; GFX10-NEXT: s_addc_u32 s5, s3, s1 |
| ; GFX10-NEXT: s_add_u32 s0, s0, 4 |
| ; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_addc_u32 s1, s1, 0 |
| ; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 |
| ; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5 |
| ; GFX10-NEXT: s_cbranch_scc0 .LBB133_1 |
| ; GFX10-NEXT: ; %bb.2: ; %bb2 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload: |
| ; GFX11: ; %bb.0: ; %bb |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: .LBB133_1: ; %bb3 |
| ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_add_u32 s4, s2, s0 |
| ; GFX11-NEXT: s_addc_u32 s5, s3, s1 |
| ; GFX11-NEXT: s_add_u32 s0, s0, 4 |
| ; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_addc_u32 s1, s1, 0 |
| ; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400 |
| ; GFX11-NEXT: s_cbranch_scc0 .LBB133_1 |
| ; GFX11-NEXT: ; %bb.2: ; %bb2 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv_multiload: |
| ; GFX12-SDAG: ; %bb.0: ; %bb |
| ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX12-SDAG-NEXT: .LBB133_1: ; %bb3 |
| ; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] |
| ; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 |
| ; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS |
| ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 |
| ; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB133_1 |
| ; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2 |
| ; GFX12-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv_multiload: |
| ; GFX12-GISEL: ; %bb.0: ; %bb |
| ; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 |
| ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 |
| ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc |
| ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 |
| ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd |
| ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX12-GISEL-NEXT: global_load_b32 v6, v[4:5], off scope:SCOPE_SYS |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS |
| ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2 |
| ; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB133_1 |
| ; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2 |
| ; GFX12-GISEL-NEXT: s_endpgm |
| bb: |
| br label %bb3 |
| |
| bb2: ; preds = %bb3 |
| ret void |
| |
| bb3: ; preds = %bb3, %bb |
| %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ] |
| %i4 = zext i32 %i to i64 |
| %i5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %i4 |
| %i6 = load volatile float, ptr addrspace(1) %i5, align 4 |
| ; %i5.1 is computed from %arg.1 but has no users; the load below uses %i5. |
| %i5.1 = getelementptr inbounds float, ptr addrspace(1) %arg.1, i64 %i4 |
| %i6.1 = load volatile float, ptr addrspace(1) %i5, align 4 |
| %i8 = add nuw nsw i32 %i, 1 |
| ; Exit after 256 elements; the expected code compares the byte offset against |
| ; 0x400 (256 * 4) instead. |
| %i9 = icmp eq i32 %i8, 256 |
| br i1 %i9, label %bb2, label %bb3 |
| } |
| |
| !0 = !{i32 0, i32 1073741824} ; (1 << 30) |
| !1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1 |