blob: 4361e5c113708b34d0a271060578c7951659f0a4 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX11,GFX11-True16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-NoTrue16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s
; global address space, addrspace(1)
; gfx12 true16, not natural alignment or not uniform mmo
define amdgpu_ps void @load_uniform_P1_i16_b16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_i16_b16_gfx12:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s4, s2
; GFX7-NEXT: s_mov_b32 s5, s3
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: s_add_i32 s0, s0, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P1_i16_b16_gfx12:
; GFX11-True16: ; %bb.0:
; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT: s_clause 0x1
; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] glc dlc
; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT: s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx12:
; GFX11-NoTrue16: ; %bb.0:
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT: s_clause 0x1
; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc
; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_b16_gfx12:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-True16-NEXT: s_clause 0x1
; GFX12-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] scope:SCOPE_SYS
; GFX12-True16-NEXT: s_wait_loadcnt 0x0
; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-True16-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx12:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NoTrue16-NEXT: s_clause 0x1
; GFX12-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] scope:SCOPE_SYS
; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
;
; Test body -- adds two i16 values loaded through inreg global pointers and
; stores the sum to %out.
;   * %a is a non-volatile load from %ptra with an explicit align 1.
;   * %b is a volatile load from %ptrb with no explicit alignment. This is the
;     only function in the visible part of the file that reads through %ptrb.
; The expected output above has, for every run line, both loads emitted as
; vector memory loads (buffer_load / global_load) addressed through SGPR pairs,
; each result copied to an SGPR with v_readfirstlane_b32, and a scalar add.
; NOTE(review): presumably the load carrying glc / scope:SCOPE_SYS is the
; volatile one -- confirm against the backend if this matters.
%a = load i16, ptr addrspace(1) %ptra, align 1
%b = load volatile i16, ptr addrspace(1) %ptrb
%sum = add i16 %a, %b
store i16 %sum, ptr addrspace(1) %out
ret void
}
; gfx11 true16, 16-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_i16_b16_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_i16_b16_gfx11:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: s_add_i32 s0, s0, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P1_i16_b16_gfx11:
; GFX11-True16: ; %bb.0:
; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT: s_clause 0x1
; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc
; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT: s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx11:
; GFX11-NoTrue16: ; %bb.0:
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT: s_clause 0x1
; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc
; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_b16_gfx11:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0
; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-True16-NEXT: s_wait_loadcnt 0x0
; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-True16-NEXT: s_wait_kmcnt 0x0
; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0
; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx11:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0
; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
;
; Test body -- adds two i16 values loaded from the same inreg pointer %ptra and
; stores the sum to %out. %ptrb is declared but not referenced here.
;   * %a is a non-volatile load with no explicit alignment.
;   * %b is a volatile load with an explicit align 4.
; Expected output above: the gfx7 and gfx11 run lines emit both loads as vector
; memory loads followed by v_readfirstlane_b32; the gfx12 run lines instead
; expect one scalar s_load_u16 plus one vector load (scope:SCOPE_SYS) whose
; result is moved to an SGPR with v_readfirstlane_b32.
; NOTE(review): presumably the s_load_u16 corresponds to %a and the
; scope:SCOPE_SYS load to the volatile %b -- confirm.
%a = load i16, ptr addrspace(1) %ptra
%b = load volatile i16, ptr addrspace(1) %ptra, align 4
%sum = add i16 %a, %b
store i16 %sum, ptr addrspace(1) %out
ret void
}
; gfx12 without true16, 16-bit any-extending load, not natural alignment or not uniform mmo
define amdgpu_ps void @load_uniform_P1_i16_anyextending_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_i16_anyextending_gfx12:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: s_add_i32 s0, s0, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
; GFX11-True16: ; %bb.0:
; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT: s_clause 0x1
; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc
; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT: s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
; GFX11-NoTrue16: ; %bb.0:
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT: s_clause 0x1
; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc
; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-True16-NEXT: s_clause 0x1
; GFX12-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-True16-NEXT: s_wait_loadcnt 0x0
; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-True16-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx12:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NoTrue16-NEXT: s_clause 0x1
; GFX12-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
;
; Test body -- adds two i16 values loaded from the same inreg pointer %ptra and
; stores the sum to %out. %ptrb is declared but not referenced here.
;   * %a is a non-volatile load with an explicit align 1.
;   * %b is a volatile load with no explicit alignment.
; Expected output above: on every run line both loads are vector memory loads
; addressed through s[0:1] (s[0:3] on gfx7), each result is copied to an SGPR
; with v_readfirstlane_b32, and the add is a scalar instruction.
%a = load i16, ptr addrspace(1) %ptra, align 1
%b = load volatile i16, ptr addrspace(1) %ptra
%sum = add i16 %a, %b
store i16 %sum, ptr addrspace(1) %out
ret void
}
; gfx11 (or older) without true16, s16 any-extending load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_i16_anyextending_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_i16_anyextending_gfx11:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: s_add_i32 s0, s0, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
; GFX11-True16: ; %bb.0:
; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT: s_clause 0x1
; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc
; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT: s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
; GFX11-NoTrue16: ; %bb.0:
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT: s_clause 0x1
; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc
; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0
; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-True16-NEXT: s_wait_loadcnt 0x0
; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-True16-NEXT: s_wait_kmcnt 0x0
; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0
; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx11:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0
; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
;
; Test body -- adds two i16 values loaded from the same inreg pointer %ptra and
; stores the sum to %out. %ptrb is declared but not referenced here.
;   * %a is a non-volatile load with no explicit alignment.
;   * %b is a volatile load with an explicit align 4.
; The five IR statements below are textually identical to the body of
; @load_uniform_P1_i16_b16_gfx11, and the expected instructions match as well;
; only the function name differs.
; Expected output above: vector loads plus v_readfirstlane_b32 on the gfx7 and
; gfx11 run lines; one s_load_u16 plus one vector load on the gfx12 run lines.
%a = load i16, ptr addrspace(1) %ptra
%b = load volatile i16, ptr addrspace(1) %ptra, align 4
%sum = add i16 %a, %b
store i16 %sum, ptr addrspace(1) %out
ret void
}
; any target, 32-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], 0
; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: s_add_i32 s0, s0, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P1_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v3, v2, s[0:1]
; GFX11-NEXT: global_load_b32 v2, v2, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NEXT: s_add_i32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b32 v3, v2, s[0:1]
; GFX12-NEXT: global_load_b32 v2, v2, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
;
; Test body -- adds two i32 values loaded from the same inreg pointer %ptra and
; stores the sum to %out. %ptrb is declared but not referenced here.
;   * %a is a non-volatile load with an explicit align 2.
;   * %b is a volatile load with no explicit alignment.
; Expected output above: on every run line both loads are 32-bit vector memory
; loads, each result is copied to an SGPR with v_readfirstlane_b32, and the
; add is a single scalar instruction. The true16 and no-true16 run lines share
; the common gfx11 / gfx12 check prefixes for this function.
%a = load i32, ptr addrspace(1) %ptra, align 2
%b = load volatile i32, ptr addrspace(1) %ptra
%sum = add i32 %a, %b
store i32 %sum, ptr addrspace(1) %out
ret void
}
; any target, 64-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_v2i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; GFX7-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: v_readfirstlane_b32 s5, v5
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s4, v4
; GFX7-NEXT: s_add_i32 s1, s1, s5
; GFX7-NEXT: s_add_i32 s0, s0, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P1_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1]
; GFX11-NEXT: global_load_b64 v[4:5], v4, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
; GFX11-NEXT: v_readfirstlane_b32 s3, v5
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
; GFX11-NEXT: v_readfirstlane_b32 s2, v4
; GFX11-NEXT: s_add_i32 s1, s1, s3
; GFX11-NEXT: s_add_i32 s0, s0, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v2i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b64 v[2:3], v4, s[0:1]
; GFX12-NEXT: global_load_b64 v[4:5], v4, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
; GFX12-NEXT: v_readfirstlane_b32 s3, v5
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-NEXT: v_readfirstlane_b32 s2, v4
; GFX12-NEXT: s_add_co_i32 s1, s1, s3
; GFX12-NEXT: s_add_co_i32 s0, s0, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
;
; Test body -- adds two <2 x i32> vectors loaded from the same inreg pointer
; %ptra and stores the sum to %out. %ptrb is declared but not referenced here.
;   * %a is a non-volatile load with an explicit align 2.
;   * %b is a volatile load with no explicit alignment.
; Expected output above: on every run line both loads are 64-bit vector memory
; loads, each of the four loaded dwords is copied to an SGPR with
; v_readfirstlane_b32, and the vector add becomes two scalar adds.
%a = load <2 x i32>, ptr addrspace(1) %ptra, align 2
%b = load volatile <2 x i32>, ptr addrspace(1) %ptra
%sum = add <2 x i32> %a, %b
store <2 x i32> %sum, ptr addrspace(1) %out
ret void
}
; any target, 96-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_v3i32_gfx12:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dwordx3 v[2:4], off, s[0:3], 0
; GFX7-NEXT: buffer_load_dwordx3 v[5:7], off, s[0:3], 0 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s4, v5
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: v_readfirstlane_b32 s6, v4
; GFX7-NEXT: v_readfirstlane_b32 s5, v6
; GFX7-NEXT: v_readfirstlane_b32 s7, v7
; GFX7-NEXT: s_add_i32 s4, s0, s4
; GFX7-NEXT: s_add_i32 s5, s1, s5
; GFX7-NEXT: s_add_i32 s6, s6, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_dwordx3 v[2:4], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P1_v3i32_gfx12:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b96 v[2:4], v5, s[0:1]
; GFX11-NEXT: global_load_b96 v[5:7], v5, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s2, v4
; GFX11-NEXT: v_readfirstlane_b32 s5, v7
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
; GFX11-NEXT: v_readfirstlane_b32 s3, v5
; GFX11-NEXT: v_readfirstlane_b32 s4, v6
; GFX11-NEXT: s_add_i32 s2, s2, s5
; GFX11-NEXT: s_add_i32 s0, s0, s3
; GFX11-NEXT: s_add_i32 s1, s1, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v3i32_gfx12:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v5, 0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b96 v[2:4], v5, s[0:1]
; GFX12-NEXT: global_load_b96 v[5:7], v5, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v4
; GFX12-NEXT: v_readfirstlane_b32 s5, v7
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
; GFX12-NEXT: v_readfirstlane_b32 s3, v5
; GFX12-NEXT: v_readfirstlane_b32 s4, v6
; GFX12-NEXT: s_add_co_i32 s2, s2, s5
; GFX12-NEXT: s_add_co_i32 s0, s0, s3
; GFX12-NEXT: s_add_co_i32 s1, s1, s4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT: s_endpgm
;
; Test body -- adds two <3 x i32> vectors loaded from the same inreg pointer
; %ptra and stores the sum to %out. %ptrb is declared but not referenced here.
;   * %a is a non-volatile load with an explicit align 2.
;   * %b is a volatile load with no explicit alignment.
; Expected output above: on every run line both loads are single 96-bit vector
; memory loads, each of the six loaded dwords is copied to an SGPR with
; v_readfirstlane_b32, and the vector add becomes three scalar adds.
; NOTE(review): the function name carries a _gfx12 suffix although the checks
; cover the gfx7 and gfx11 run lines too -- the reason for the suffix is not
; visible in this part of the file.
%a = load <3 x i32>, ptr addrspace(1) %ptra, align 2
%b = load volatile <3 x i32>, ptr addrspace(1) %ptra
%sum = add <3 x i32> %a, %b
store <3 x i32> %sum, ptr addrspace(1) %out
ret void
}
; any target, 128-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_v4i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s4, v6
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: v_readfirstlane_b32 s6, v4
; GFX7-NEXT: v_readfirstlane_b32 s7, v5
; GFX7-NEXT: v_readfirstlane_b32 s5, v7
; GFX7-NEXT: v_readfirstlane_b32 s8, v8
; GFX7-NEXT: v_readfirstlane_b32 s9, v9
; GFX7-NEXT: s_add_i32 s4, s0, s4
; GFX7-NEXT: s_add_i32 s5, s1, s5
; GFX7-NEXT: s_add_i32 s6, s6, s8
; GFX7-NEXT: s_add_i32 s7, s7, s9
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s7
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P1_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1]
; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s3, v5
; GFX11-NEXT: v_readfirstlane_b32 s7, v9
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
; GFX11-NEXT: v_readfirstlane_b32 s2, v4
; GFX11-NEXT: v_readfirstlane_b32 s4, v6
; GFX11-NEXT: v_readfirstlane_b32 s5, v7
; GFX11-NEXT: v_readfirstlane_b32 s6, v8
; GFX11-NEXT: s_add_i32 s3, s3, s7
; GFX11-NEXT: s_add_i32 s0, s0, s4
; GFX11-NEXT: s_add_i32 s1, s1, s5
; GFX11-NEXT: s_add_i32 s2, s2, s6
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v4i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v6, 0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1]
; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s3, v5
; GFX12-NEXT: v_readfirstlane_b32 s7, v9
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
; GFX12-NEXT: v_readfirstlane_b32 s2, v4
; GFX12-NEXT: v_readfirstlane_b32 s4, v6
; GFX12-NEXT: v_readfirstlane_b32 s5, v7
; GFX12-NEXT: v_readfirstlane_b32 s6, v8
; GFX12-NEXT: s_add_co_i32 s3, s3, s7
; GFX12-NEXT: s_add_co_i32 s0, s0, s4
; GFX12-NEXT: s_add_co_i32 s1, s1, s5
; GFX12-NEXT: s_add_co_i32 s2, s2, s6
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: s_endpgm
;
; Test body -- adds two <4 x i32> vectors loaded from the same inreg pointer
; %ptra and stores the sum to %out. %ptrb is declared but not referenced here.
;   * %a is a non-volatile load with an explicit align 2.
;   * %b is a volatile load with no explicit alignment.
; Expected output above: on every run line both loads are single 128-bit vector
; memory loads, each of the eight loaded dwords is copied to an SGPR with
; v_readfirstlane_b32, and the vector add becomes four scalar adds.
%a = load <4 x i32>, ptr addrspace(1) %ptra, align 2
%b = load volatile <4 x i32>, ptr addrspace(1) %ptra
%sum = add <4 x i32> %a, %b
store <4 x i32> %sum, ptr addrspace(1) %out
ret void
}
; any target, 256-bit load, not align 4 or not uniform mmo
define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_v8i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16
; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:16 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_readfirstlane_b32 s4, v2
; GFX7-NEXT: v_readfirstlane_b32 s5, v3
; GFX7-NEXT: v_readfirstlane_b32 s12, v10
; GFX7-NEXT: v_readfirstlane_b32 s6, v4
; GFX7-NEXT: v_readfirstlane_b32 s7, v5
; GFX7-NEXT: v_readfirstlane_b32 s8, v6
; GFX7-NEXT: v_readfirstlane_b32 s13, v11
; GFX7-NEXT: v_readfirstlane_b32 s14, v12
; GFX7-NEXT: v_readfirstlane_b32 s15, v13
; GFX7-NEXT: v_readfirstlane_b32 s16, v14
; GFX7-NEXT: s_add_i32 s4, s4, s12
; GFX7-NEXT: v_readfirstlane_b32 s9, v7
; GFX7-NEXT: v_readfirstlane_b32 s10, v8
; GFX7-NEXT: v_readfirstlane_b32 s11, v9
; GFX7-NEXT: v_readfirstlane_b32 s17, v15
; GFX7-NEXT: v_readfirstlane_b32 s18, v16
; GFX7-NEXT: v_readfirstlane_b32 s19, v17
; GFX7-NEXT: s_add_i32 s5, s5, s13
; GFX7-NEXT: s_add_i32 s6, s6, s14
; GFX7-NEXT: s_add_i32 s7, s7, s15
; GFX7-NEXT: s_add_i32 s8, s8, s16
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_add_i32 s9, s9, s17
; GFX7-NEXT: s_add_i32 s10, s10, s18
; GFX7-NEXT: s_add_i32 s11, s11, s19
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s7
; GFX7-NEXT: v_mov_b32_e32 v6, s8
; GFX7-NEXT: v_mov_b32_e32 v7, s9
; GFX7-NEXT: v_mov_b32_e32 v8, s10
; GFX7-NEXT: v_mov_b32_e32 v9, s11
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P1_v8i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v14, 0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b128 v[2:5], v14, s[0:1]
; GFX11-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16
; GFX11-NEXT: global_load_b128 v[10:13], v14, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s3, v5
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
; GFX11-NEXT: v_readfirstlane_b32 s11, v13
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
; GFX11-NEXT: v_readfirstlane_b32 s2, v4
; GFX11-NEXT: v_readfirstlane_b32 s7, v9
; GFX11-NEXT: v_readfirstlane_b32 s8, v10
; GFX11-NEXT: v_readfirstlane_b32 s9, v11
; GFX11-NEXT: v_readfirstlane_b32 s10, v12
; GFX11-NEXT: v_readfirstlane_b32 s15, v17
; GFX11-NEXT: v_readfirstlane_b32 s4, v6
; GFX11-NEXT: v_readfirstlane_b32 s5, v7
; GFX11-NEXT: v_readfirstlane_b32 s6, v8
; GFX11-NEXT: v_readfirstlane_b32 s12, v14
; GFX11-NEXT: v_readfirstlane_b32 s13, v15
; GFX11-NEXT: v_readfirstlane_b32 s14, v16
; GFX11-NEXT: s_add_i32 s3, s3, s11
; GFX11-NEXT: s_add_i32 s0, s0, s8
; GFX11-NEXT: s_add_i32 s1, s1, s9
; GFX11-NEXT: s_add_i32 s2, s2, s10
; GFX11-NEXT: s_add_i32 s7, s7, s15
; GFX11-NEXT: s_add_i32 s4, s4, s12
; GFX11-NEXT: s_add_i32 s5, s5, s13
; GFX11-NEXT: s_add_i32 s6, s6, s14
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v8i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v14, 0
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: global_load_b128 v[2:5], v14, s[0:1]
; GFX12-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16
; GFX12-NEXT: global_load_b128 v[10:13], v14, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s3, v5
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-NEXT: v_readfirstlane_b32 s11, v13
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
; GFX12-NEXT: v_readfirstlane_b32 s2, v4
; GFX12-NEXT: v_readfirstlane_b32 s7, v9
; GFX12-NEXT: v_readfirstlane_b32 s8, v10
; GFX12-NEXT: v_readfirstlane_b32 s9, v11
; GFX12-NEXT: v_readfirstlane_b32 s10, v12
; GFX12-NEXT: v_readfirstlane_b32 s15, v17
; GFX12-NEXT: v_readfirstlane_b32 s4, v6
; GFX12-NEXT: v_readfirstlane_b32 s5, v7
; GFX12-NEXT: v_readfirstlane_b32 s6, v8
; GFX12-NEXT: v_readfirstlane_b32 s12, v14
; GFX12-NEXT: v_readfirstlane_b32 s13, v15
; GFX12-NEXT: v_readfirstlane_b32 s14, v16
; GFX12-NEXT: s_add_co_i32 s3, s3, s11
; GFX12-NEXT: s_add_co_i32 s0, s0, s8
; GFX12-NEXT: s_add_co_i32 s1, s1, s9
; GFX12-NEXT: s_add_co_i32 s2, s2, s10
; GFX12-NEXT: s_add_co_i32 s7, s7, s15
; GFX12-NEXT: s_add_co_i32 s4, s4, s12
; GFX12-NEXT: s_add_co_i32 s5, s5, s13
; GFX12-NEXT: s_add_co_i32 s6, s6, s14
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT: s_endpgm
;
; Test body -- adds two <8 x i32> vectors loaded from the same inreg pointer
; %ptra and stores the sum to %out. %ptrb is declared but not referenced here.
;   * %a is a non-volatile load with an explicit align 2.
;   * %b is a volatile load with no explicit alignment.
; Expected output above: each 256-bit load appears as two 128-bit vector memory
; loads at offsets 0 and 16, and the store is likewise split into two 128-bit
; stores. The two glc / scope:SCOPE_SYS halves are each followed by their own
; wait-for-zero instruction. All sixteen loaded dwords are copied to SGPRs with
; v_readfirstlane_b32 and the vector add becomes eight scalar adds.
%a = load <8 x i32>, ptr addrspace(1) %ptra, align 2
%b = load volatile <8 x i32>, ptr addrspace(1) %ptra
%sum = add <8 x i32> %a, %b
store <8 x i32> %sum, ptr addrspace(1) %out
ret void
}
; any target, 512-bit load, not align 4 or not uniform mmo
; Loads <16 x i32> twice through the inreg addrspace(1) pointer %ptra (once
; with align 2, once volatile), adds the two vectors element-wise and stores
; the sum to %out. %ptrb is not used by this function.
define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P1_v16i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16
; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32
; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_readfirstlane_b32 s4, v2
; GFX7-NEXT: v_readfirstlane_b32 s5, v3
; GFX7-NEXT: v_readfirstlane_b32 s6, v4
; GFX7-NEXT: v_readfirstlane_b32 s7, v5
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s8, v6
; GFX7-NEXT: v_readfirstlane_b32 s9, v7
; GFX7-NEXT: v_readfirstlane_b32 s10, v8
; GFX7-NEXT: v_readfirstlane_b32 s11, v9
; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s12, v10
; GFX7-NEXT: v_readfirstlane_b32 s13, v11
; GFX7-NEXT: v_readfirstlane_b32 s14, v12
; GFX7-NEXT: v_readfirstlane_b32 s15, v13
; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s16, v14
; GFX7-NEXT: v_readfirstlane_b32 s17, v15
; GFX7-NEXT: v_readfirstlane_b32 s18, v16
; GFX7-NEXT: v_readfirstlane_b32 s19, v17
; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_readfirstlane_b32 s20, v2
; GFX7-NEXT: v_readfirstlane_b32 s21, v3
; GFX7-NEXT: v_readfirstlane_b32 s22, v4
; GFX7-NEXT: v_readfirstlane_b32 s23, v5
; GFX7-NEXT: s_add_i32 s4, s4, s20
; GFX7-NEXT: v_readfirstlane_b32 s24, v6
; GFX7-NEXT: v_readfirstlane_b32 s25, v7
; GFX7-NEXT: v_readfirstlane_b32 s26, v8
; GFX7-NEXT: v_readfirstlane_b32 s27, v9
; GFX7-NEXT: s_add_i32 s5, s5, s21
; GFX7-NEXT: v_readfirstlane_b32 s28, v10
; GFX7-NEXT: v_readfirstlane_b32 s29, v11
; GFX7-NEXT: v_readfirstlane_b32 s30, v12
; GFX7-NEXT: v_readfirstlane_b32 s31, v13
; GFX7-NEXT: s_add_i32 s6, s6, s22
; GFX7-NEXT: v_readfirstlane_b32 s33, v14
; GFX7-NEXT: v_readfirstlane_b32 s34, v15
; GFX7-NEXT: v_readfirstlane_b32 s35, v16
; GFX7-NEXT: v_readfirstlane_b32 s36, v17
; GFX7-NEXT: s_add_i32 s7, s7, s23
; GFX7-NEXT: s_add_i32 s8, s8, s24
; GFX7-NEXT: s_add_i32 s12, s12, s28
; GFX7-NEXT: s_add_i32 s16, s16, s33
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_add_i32 s9, s9, s25
; GFX7-NEXT: s_add_i32 s10, s10, s26
; GFX7-NEXT: s_add_i32 s11, s11, s27
; GFX7-NEXT: s_add_i32 s13, s13, s29
; GFX7-NEXT: s_add_i32 s14, s14, s30
; GFX7-NEXT: s_add_i32 s15, s15, s31
; GFX7-NEXT: s_add_i32 s17, s17, s34
; GFX7-NEXT: s_add_i32 s18, s18, s35
; GFX7-NEXT: s_add_i32 s19, s19, s36
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s7
; GFX7-NEXT: v_mov_b32_e32 v6, s8
; GFX7-NEXT: v_mov_b32_e32 v10, s12
; GFX7-NEXT: v_mov_b32_e32 v14, s16
; GFX7-NEXT: v_mov_b32_e32 v7, s9
; GFX7-NEXT: v_mov_b32_e32 v8, s10
; GFX7-NEXT: v_mov_b32_e32 v9, s11
; GFX7-NEXT: v_mov_b32_e32 v11, s13
; GFX7-NEXT: v_mov_b32_e32 v12, s14
; GFX7-NEXT: v_mov_b32_e32 v13, s15
; GFX7-NEXT: v_mov_b32_e32 v15, s17
; GFX7-NEXT: v_mov_b32_e32 v16, s18
; GFX7-NEXT: v_mov_b32_e32 v17, s19
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[0:3], 0 addr64 offset:48
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P1_v16i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v30, 0
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: global_load_b128 v[2:5], v30, s[0:1]
; GFX11-NEXT: global_load_b128 v[6:9], v30, s[0:1] offset:16
; GFX11-NEXT: global_load_b128 v[10:13], v30, s[0:1] offset:32
; GFX11-NEXT: global_load_b128 v[14:17], v30, s[0:1] offset:48
; GFX11-NEXT: global_load_b128 v[18:21], v30, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[22:25], v30, s[0:1] offset:16 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[26:29], v30, s[0:1] offset:32 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s3, v5
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
; GFX11-NEXT: v_readfirstlane_b32 s2, v4
; GFX11-NEXT: v_readfirstlane_b32 s19, v21
; GFX11-NEXT: v_readfirstlane_b32 s7, v9
; GFX11-NEXT: v_readfirstlane_b32 s16, v18
; GFX11-NEXT: v_readfirstlane_b32 s17, v19
; GFX11-NEXT: v_readfirstlane_b32 s18, v20
; GFX11-NEXT: v_readfirstlane_b32 s23, v25
; GFX11-NEXT: v_readfirstlane_b32 s4, v6
; GFX11-NEXT: v_readfirstlane_b32 s5, v7
; GFX11-NEXT: v_readfirstlane_b32 s6, v8
; GFX11-NEXT: v_readfirstlane_b32 s11, v13
; GFX11-NEXT: v_readfirstlane_b32 s20, v22
; GFX11-NEXT: v_readfirstlane_b32 s21, v23
; GFX11-NEXT: v_readfirstlane_b32 s22, v24
; GFX11-NEXT: v_readfirstlane_b32 s27, v29
; GFX11-NEXT: v_readfirstlane_b32 s8, v10
; GFX11-NEXT: v_readfirstlane_b32 s9, v11
; GFX11-NEXT: v_readfirstlane_b32 s10, v12
; GFX11-NEXT: v_readfirstlane_b32 s15, v17
; GFX11-NEXT: v_readfirstlane_b32 s24, v26
; GFX11-NEXT: v_readfirstlane_b32 s25, v27
; GFX11-NEXT: v_readfirstlane_b32 s26, v28
; GFX11-NEXT: v_readfirstlane_b32 s31, v33
; GFX11-NEXT: v_readfirstlane_b32 s12, v14
; GFX11-NEXT: v_readfirstlane_b32 s13, v15
; GFX11-NEXT: v_readfirstlane_b32 s14, v16
; GFX11-NEXT: v_readfirstlane_b32 s28, v30
; GFX11-NEXT: v_readfirstlane_b32 s29, v31
; GFX11-NEXT: v_readfirstlane_b32 s30, v32
; GFX11-NEXT: s_add_i32 s3, s3, s19
; GFX11-NEXT: s_add_i32 s0, s0, s16
; GFX11-NEXT: s_add_i32 s1, s1, s17
; GFX11-NEXT: s_add_i32 s2, s2, s18
; GFX11-NEXT: s_add_i32 s7, s7, s23
; GFX11-NEXT: s_add_i32 s4, s4, s20
; GFX11-NEXT: s_add_i32 s5, s5, s21
; GFX11-NEXT: s_add_i32 s6, s6, s22
; GFX11-NEXT: s_add_i32 s11, s11, s27
; GFX11-NEXT: v_mov_b32_e32 v5, s3
; GFX11-NEXT: s_add_i32 s8, s8, s24
; GFX11-NEXT: s_add_i32 s9, s9, s25
; GFX11-NEXT: s_add_i32 s10, s10, s26
; GFX11-NEXT: s_add_i32 s15, s15, s31
; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7
; GFX11-NEXT: s_add_i32 s12, s12, s28
; GFX11-NEXT: s_add_i32 s13, s13, s29
; GFX11-NEXT: s_add_i32 s14, s14, s30
; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5
; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11
; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9
; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15
; GFX11-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13
; GFX11-NEXT: v_mov_b32_e32 v14, s12
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P1_v16i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v30, 0
; GFX12-NEXT: s_clause 0x4
; GFX12-NEXT: global_load_b128 v[2:5], v30, s[0:1]
; GFX12-NEXT: global_load_b128 v[6:9], v30, s[0:1] offset:16
; GFX12-NEXT: global_load_b128 v[10:13], v30, s[0:1] offset:32
; GFX12-NEXT: global_load_b128 v[14:17], v30, s[0:1] offset:48
; GFX12-NEXT: global_load_b128 v[18:21], v30, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b128 v[22:25], v30, s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b128 v[26:29], v30, s[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s3, v5
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
; GFX12-NEXT: v_readfirstlane_b32 s2, v4
; GFX12-NEXT: v_readfirstlane_b32 s19, v21
; GFX12-NEXT: v_readfirstlane_b32 s7, v9
; GFX12-NEXT: v_readfirstlane_b32 s16, v18
; GFX12-NEXT: v_readfirstlane_b32 s17, v19
; GFX12-NEXT: v_readfirstlane_b32 s18, v20
; GFX12-NEXT: v_readfirstlane_b32 s23, v25
; GFX12-NEXT: v_readfirstlane_b32 s4, v6
; GFX12-NEXT: v_readfirstlane_b32 s5, v7
; GFX12-NEXT: v_readfirstlane_b32 s6, v8
; GFX12-NEXT: v_readfirstlane_b32 s11, v13
; GFX12-NEXT: v_readfirstlane_b32 s20, v22
; GFX12-NEXT: v_readfirstlane_b32 s21, v23
; GFX12-NEXT: v_readfirstlane_b32 s22, v24
; GFX12-NEXT: v_readfirstlane_b32 s27, v29
; GFX12-NEXT: v_readfirstlane_b32 s8, v10
; GFX12-NEXT: v_readfirstlane_b32 s9, v11
; GFX12-NEXT: v_readfirstlane_b32 s10, v12
; GFX12-NEXT: v_readfirstlane_b32 s15, v17
; GFX12-NEXT: v_readfirstlane_b32 s24, v26
; GFX12-NEXT: v_readfirstlane_b32 s25, v27
; GFX12-NEXT: v_readfirstlane_b32 s26, v28
; GFX12-NEXT: v_readfirstlane_b32 s31, v33
; GFX12-NEXT: v_readfirstlane_b32 s12, v14
; GFX12-NEXT: v_readfirstlane_b32 s13, v15
; GFX12-NEXT: v_readfirstlane_b32 s14, v16
; GFX12-NEXT: v_readfirstlane_b32 s28, v30
; GFX12-NEXT: v_readfirstlane_b32 s29, v31
; GFX12-NEXT: v_readfirstlane_b32 s30, v32
; GFX12-NEXT: s_add_co_i32 s3, s3, s19
; GFX12-NEXT: s_add_co_i32 s0, s0, s16
; GFX12-NEXT: s_add_co_i32 s1, s1, s17
; GFX12-NEXT: s_add_co_i32 s2, s2, s18
; GFX12-NEXT: s_add_co_i32 s7, s7, s23
; GFX12-NEXT: s_add_co_i32 s4, s4, s20
; GFX12-NEXT: s_add_co_i32 s5, s5, s21
; GFX12-NEXT: s_add_co_i32 s6, s6, s22
; GFX12-NEXT: s_add_co_i32 s11, s11, s27
; GFX12-NEXT: v_mov_b32_e32 v5, s3
; GFX12-NEXT: s_add_co_i32 s8, s8, s24
; GFX12-NEXT: s_add_co_i32 s9, s9, s25
; GFX12-NEXT: s_add_co_i32 s10, s10, s26
; GFX12-NEXT: s_add_co_i32 s15, s15, s31
; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7
; GFX12-NEXT: s_add_co_i32 s12, s12, s28
; GFX12-NEXT: s_add_co_i32 s13, s13, s29
; GFX12-NEXT: s_add_co_i32 s14, s14, s30
; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5
; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11
; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9
; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15
; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13
; GFX12-NEXT: v_mov_b32_e32 v14, s12
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
; GFX12-NEXT: s_endpgm
; %a is the under-aligned (align 2) load and %b the volatile load. On every run
; line the checks above expect each of them as four 128-bit vector-memory loads
; (the second group carrying glc / glc dlc / scope:SCOPE_SYS), with the results
; copied to SGPRs by v_readfirstlane_b32 before the scalar adds.
%a = load <16 x i32>, ptr addrspace(1) %ptra, align 2
%b = load volatile <16 x i32>, ptr addrspace(1) %ptra
%sum = add <16 x i32> %a, %b
store <16 x i32> %sum, ptr addrspace(1) %out
ret void
}
; Copies a single i8 from the inreg addrspace(3) pointer %ptra to the
; addrspace(3) pointer %out.
define amdgpu_ps void @load_divergent_P3_i8_any_extending(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
; GFX7-LABEL: load_divergent_P3_i8_any_extending:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_u8 v1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b8 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_divergent_P3_i8_any_extending:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: ds_load_u8 v1, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_store_b8 v0, v1
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_divergent_P3_i8_any_extending:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: ds_load_u8 v1, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: ds_store_b8 v0, v1
; GFX12-NEXT: s_endpgm
; Although %ptra is passed in an SGPR (s0), the checks above expect it to be
; copied to v1 and the value to be read with a DS byte load (ds_read_u8 /
; ds_load_u8) that feeds the DS byte store directly.
%a = load i8, ptr addrspace(3) %ptra
store i8 %a, ptr addrspace(3) %out
ret void
}
; with true16, S16 16-bit load
; without true16, S32 16-bit any-extending load
; Copies a single i16 from the inreg addrspace(3) pointer %ptra to the
; addrspace(3) pointer %out.
define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
; GFX7-LABEL: load_divergent_P3_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_u16 v1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b16 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX11-True16-LABEL: load_divergent_P3_i16:
; GFX11-True16: ; %bb.0:
; GFX11-True16-NEXT: v_mov_b32_e32 v1, s0
; GFX11-True16-NEXT: ds_load_u16_d16 v1, v1
; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-True16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-True16-NEXT: v_mov_b16_e32 v1.l, s0
; GFX11-True16-NEXT: ds_store_b16 v0, v1
; GFX11-True16-NEXT: s_endpgm
;
; GFX11-NoTrue16-LABEL: load_divergent_P3_i16:
; GFX11-NoTrue16: ; %bb.0:
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NoTrue16-NEXT: ds_load_u16 v1, v1
; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NoTrue16-NEXT: ds_store_b16 v0, v1
; GFX11-NoTrue16-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_divergent_P3_i16:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: v_mov_b32_e32 v1, s0
; GFX12-True16-NEXT: ds_load_u16_d16 v1, v1
; GFX12-True16-NEXT: s_wait_dscnt 0x0
; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-True16-NEXT: s_wait_alu 0xf1ff
; GFX12-True16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-True16-NEXT: v_mov_b16_e32 v1.l, s0
; GFX12-True16-NEXT: ds_store_b16 v0, v1
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_divergent_P3_i16:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NoTrue16-NEXT: ds_load_u16 v1, v1
; GFX12-NoTrue16-NEXT: s_wait_dscnt 0x0
; GFX12-NoTrue16-NEXT: ds_store_b16 v0, v1
; GFX12-NoTrue16-NEXT: s_endpgm
; The true16 runs above expect ds_load_u16_d16 followed by a round trip through
; an SGPR (v_readfirstlane_b32, then v_mov_b16) before the store; the remaining
; runs expect ds_read_u16 / ds_load_u16 feeding the 16-bit DS store directly.
%a = load i16, ptr addrspace(3) %ptra
store i16 %a, ptr addrspace(3) %out
ret void
}
; Copies a single i32 from the inreg addrspace(3) pointer %ptra to the
; addrspace(3) pointer %out.
define amdgpu_ps void @load_divergent_P3_i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
; GFX7-LABEL: load_divergent_P3_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b32 v1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_divergent_P3_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: ds_load_b32 v1, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_store_b32 v0, v1
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_divergent_P3_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: ds_load_b32 v1, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: ds_store_b32 v0, v1
; GFX12-NEXT: s_endpgm
; The checks above expect the SGPR address (s0) copied to v1 and one 32-bit DS
; load (ds_read_b32 / ds_load_b32) whose VGPR result is stored unchanged.
%a = load i32, ptr addrspace(3) %ptra
store i32 %a, ptr addrspace(3) %out
ret void
}
; Copies a <2 x i32> from the inreg addrspace(3) pointer %ptra to the
; addrspace(3) pointer %out.
define amdgpu_ps void @load_divergent_P3_v2i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
; GFX7-LABEL: load_divergent_P3_v2i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b64 v[1:2], v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b64 v0, v[1:2]
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_divergent_P3_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: ds_load_b64 v[1:2], v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_store_b64 v0, v[1:2]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_divergent_P3_v2i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: ds_load_b64 v[1:2], v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: ds_store_b64 v0, v[1:2]
; GFX12-NEXT: s_endpgm
; The checks above expect the whole vector to move with one 64-bit DS load
; (ds_read_b64 / ds_load_b64) and one 64-bit DS store, with no splitting.
%a = load <2 x i32>, ptr addrspace(3) %ptra
store <2 x i32> %a, ptr addrspace(3) %out
ret void
}
; Copies a <3 x i32> from the inreg addrspace(3) pointer %ptra to the
; addrspace(3) pointer %out.
define amdgpu_ps void @load_divergent_P3_v3i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
; GFX7-LABEL: load_divergent_P3_v3i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b96 v[1:3], v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b96 v0, v[1:3]
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_divergent_P3_v3i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: ds_load_b96 v[1:3], v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_store_b96 v0, v[1:3]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_divergent_P3_v3i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: ds_load_b96 v[1:3], v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: ds_store_b96 v0, v[1:3]
; GFX12-NEXT: s_endpgm
; The checks above expect the whole vector to move with one 96-bit DS load
; (ds_read_b96 / ds_load_b96) and one 96-bit DS store, with no splitting.
%a = load <3 x i32>, ptr addrspace(3) %ptra
store <3 x i32> %a, ptr addrspace(3) %out
ret void
}
; Copies a <4 x i32> from the inreg addrspace(3) pointer %ptra to the
; addrspace(3) pointer %out.
define amdgpu_ps void @load_divergent_P3_v4i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) {
; GFX7-LABEL: load_divergent_P3_v4i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b128 v[1:4], v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b128 v0, v[1:4]
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_divergent_P3_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: ds_load_b128 v[1:4], v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_store_b128 v0, v[1:4]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_divergent_P3_v4i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: ds_load_b128 v[1:4], v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: ds_store_b128 v0, v[1:4]
; GFX12-NEXT: s_endpgm
; The checks above expect the whole vector to move with one 128-bit DS load
; (ds_read_b128 / ds_load_b128) and one 128-bit DS store, with no splitting.
%a = load <4 x i32>, ptr addrspace(3) %ptra
store <4 x i32> %a, ptr addrspace(3) %out
ret void
}
; constant address space, addrspace(4)
; The "not uniform load mmo" check for G_LOAD covers the case where the MMO somehow
; ends up with an address space other than 4; there are no LLVM IR tests for that case.
; %b in tests will end up as uniform load in sgpr
; gfx12 true 16, not natural alignment
; Adds an align-1 i16 load from %ptra to a volatile i16 load from %ptrb (both
; inreg addrspace(4) pointers) and stores the i16 sum to the addrspace(1)
; pointer %out.
define amdgpu_ps void @load_uniform_P4_i16_b16_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P4_i16_b16_gfx12:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s4, s2
; GFX7-NEXT: s_mov_b32 s5, s3
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: s_add_i32 s0, s0, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P4_i16_b16_gfx12:
; GFX11-True16: ; %bb.0:
; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT: s_clause 0x1
; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] glc dlc
; GFX11-True16-NEXT: s_waitcnt vmcnt(1)
; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT: s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx12:
; GFX11-NoTrue16: ; %bb.0:
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT: s_clause 0x1
; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc
; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(1)
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P4_i16_b16_gfx12:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1]
; GFX12-True16-NEXT: s_load_u16 s0, s[2:3], 0x0
; GFX12-True16-NEXT: s_wait_loadcnt 0x0
; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-True16-NEXT: s_wait_kmcnt 0x0
; GFX12-True16-NEXT: s_add_co_i32 s0, s1, s0
; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx12:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1]
; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[2:3], 0x0
; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s1, s0
; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
; %a is loaded with align 1. Every run above expects a vector-memory load
; through s[0:1] (%ptra) followed by v_readfirstlane_b32. For the volatile load
; through s[2:3] (%ptrb) only the gfx12 runs expect a scalar s_load_u16; the
; gfx7 and gfx11 runs expect a second vector load marked glc.
%a = load i16, ptr addrspace(4) %ptra, align 1
%b = load volatile i16, ptr addrspace(4) %ptrb
%sum = add i16 %a, %b
store i16 %sum, ptr addrspace(1) %out
ret void
}
; gfx11 true16, 16-bit load, not align 4
; Loads i16 twice through the inreg addrspace(4) pointer %ptra (once with no
; explicit alignment, once volatile with align 4), adds the two values and
; stores the i16 sum to the addrspace(1) pointer %out. %ptrb is not used.
define amdgpu_ps void @load_uniform_P4_i16_b16_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P4_i16_b16_gfx11:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s0, s1, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P4_i16_b16_gfx11:
; GFX11-True16: ; %bb.0:
; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1]
; GFX11-True16-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-True16-NEXT: s_add_i32 s0, s1, s0
; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT: s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx11:
; GFX11-NoTrue16: ; %bb.0:
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1]
; GFX11-NoTrue16-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NoTrue16-NEXT: s_add_i32 s0, s1, s0
; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P4_i16_b16_gfx11:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: s_clause 0x1
; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0
; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT: s_wait_kmcnt 0x0
; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0
; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx11:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: s_clause 0x1
; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0
; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
; The gfx7 and gfx11 runs above expect one vector-memory load (read back with
; v_readfirstlane_b32) plus one 32-bit scalar load (s_load_dword / s_load_b32);
; presumably the scalar one is the align-4 load %b. The gfx12 runs expect both
; loads as s_load_u16.
%a = load i16, ptr addrspace(4) %ptra
%b = load volatile i16, ptr addrspace(4) %ptra, align 4
%sum = add i16 %a, %b
store i16 %sum, ptr addrspace(1) %out
ret void
}
; gfx12 without true16, 16-bit any-extending load, not natural alignment
; Loads i16 twice through the inreg addrspace(4) pointer %ptra (once with
; align 1, once volatile), adds the two values and stores the i16 sum to the
; addrspace(1) pointer %out. %ptrb is not used.
define amdgpu_ps void @load_uniform_P4_i16_anyextending_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P4_i16_anyextending_gfx12:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: s_add_i32 s0, s0, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P4_i16_anyextending_gfx12:
; GFX11-True16: ; %bb.0:
; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT: s_clause 0x1
; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1]
; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc
; GFX11-True16-NEXT: s_waitcnt vmcnt(1)
; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT: s_add_i32 s0, s0, s1
; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT: s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx12:
; GFX11-NoTrue16: ; %bb.0:
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT: s_clause 0x1
; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1]
; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc
; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(1)
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1
; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P4_i16_anyextending_gfx12:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1]
; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT: s_wait_loadcnt 0x0
; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-True16-NEXT: s_wait_kmcnt 0x0
; GFX12-True16-NEXT: s_add_co_i32 s0, s1, s0
; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx12:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1]
; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0
; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s1, s0
; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
; The gfx7 and gfx11 runs above expect both loads as vector-memory loads (the
; second marked glc) read back with v_readfirstlane_b32. The gfx12 runs expect
; one vector load plus one scalar s_load_u16; presumably the vector one is the
; align-1 load %a.
%a = load i16, ptr addrspace(4) %ptra, align 1
%b = load volatile i16, ptr addrspace(4) %ptra
%sum = add i16 %a, %b
store i16 %sum, ptr addrspace(1) %out
ret void
}
; gfx11 (or older) without true16, s16 any-extending load, not align 4
; Loads i16 twice through the inreg addrspace(4) pointer %ptra (once with no
; explicit alignment, once volatile with align 4), adds the two values and
; stores the i16 sum to the addrspace(1) pointer %out. %ptrb is not used.
define amdgpu_ps void @load_uniform_P4_i16_anyextending_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P4_i16_anyextending_gfx11:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s0, s1, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-True16-LABEL: load_uniform_P4_i16_anyextending_gfx11:
; GFX11-True16: ; %bb.0:
; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1]
; GFX11-True16-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-True16-NEXT: s_waitcnt vmcnt(0)
; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-True16-NEXT: s_add_i32 s0, s1, s0
; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-True16-NEXT: s_endpgm
;
; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx11:
; GFX11-NoTrue16: ; %bb.0:
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1]
; GFX11-NoTrue16-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0)
; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NoTrue16-NEXT: s_add_i32 s0, s1, s0
; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NoTrue16-NEXT: s_endpgm
;
; GFX12-True16-LABEL: load_uniform_P4_i16_anyextending_gfx11:
; GFX12-True16: ; %bb.0:
; GFX12-True16-NEXT: s_clause 0x1
; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0
; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-True16-NEXT: s_wait_kmcnt 0x0
; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0
; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0
; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-True16-NEXT: s_endpgm
;
; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx11:
; GFX12-NoTrue16: ; %bb.0:
; GFX12-NoTrue16-NEXT: s_clause 0x1
; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0
; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0
; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0
; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NoTrue16-NEXT: s_endpgm
; The gfx7 and gfx11 runs above expect one vector-memory load (read back with
; v_readfirstlane_b32) plus one 32-bit scalar load (s_load_dword / s_load_b32);
; presumably the scalar one is the align-4 load %b. The gfx12 runs expect both
; loads as s_load_u16.
%a = load i16, ptr addrspace(4) %ptra
%b = load volatile i16, ptr addrspace(4) %ptra, align 4
%sum = add i16 %a, %b
store i16 %sum, ptr addrspace(1) %out
ret void
}
; any target, 32-bit load, not align 4
; Loads i32 twice through the inreg addrspace(4) pointer %ptra (once with
; align 2, once volatile), adds the two values and stores the i32 sum to the
; addrspace(1) pointer %out. %ptrb is not used.
define amdgpu_ps void @load_uniform_P4_i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P4_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s0, s1, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P4_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_load_b32 v2, v2, s[0:1]
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_load_b32 v2, v2, s[0:1]
; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s1, v2
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s0, s1, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
; Every run above expects one 32-bit vector-memory load (read back with
; v_readfirstlane_b32) plus one 32-bit scalar load; presumably the vector one
; is the align-2 load %a and the scalar one is the volatile load %b.
%a = load i32, ptr addrspace(4) %ptra, align 2
%b = load volatile i32, ptr addrspace(4) %ptra
%sum = add i32 %a, %b
store i32 %sum, ptr addrspace(1) %out
ret void
}
; any target, 64-bit load, not align 4
define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P4_v2i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s5, v3
; GFX7-NEXT: v_readfirstlane_b32 s4, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s1, s5, s1
; GFX7-NEXT: s_add_i32 s0, s4, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P4_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s0, s2, s0
; GFX11-NEXT: s_add_i32 s1, s3, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v2i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1]
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v2
; GFX12-NEXT: v_readfirstlane_b32 s3, v3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s0, s2, s0
; GFX12-NEXT: s_add_co_i32 s1, s3, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
; Both loads read the same inreg pointer %ptra; %ptrb is not used in this body.
; %a is a non-volatile <2 x i32> load with an explicit alignment of 2 bytes.
; %b is a volatile <2 x i32> load with no explicit alignment.
; The checks above expect one 64-bit VMEM load whose two dwords are each
; copied to an SGPR with v_readfirstlane_b32, plus one 64-bit scalar s_load,
; followed by one scalar add per element.
; NOTE(review): presumably the under-aligned %a is the VMEM load and the
; volatile %b is the scalar load - confirm against the selection rules.
%a = load <2 x i32>, ptr addrspace(4) %ptra, align 2
%b = load volatile <2 x i32>, ptr addrspace(4) %ptra
%sum = add <2 x i32> %a, %b
; The element-wise sum is written through the non-inreg global pointer %out.
store <2 x i32> %sum, ptr addrspace(1) %out
ret void
}
; any target, 96-bit load, not align 4
define amdgpu_ps void @load_uniform_P4_v3i32_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P4_v3i32_gfx12:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dwordx3 v[2:4], off, s[0:3], 0
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s7, v4
; GFX7-NEXT: s_add_i32 s4, s0, s4
; GFX7-NEXT: s_add_i32 s5, s1, s5
; GFX7-NEXT: s_add_i32 s6, s7, s6
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_dwordx3 v[2:4], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P4_v3i32_gfx12:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_load_b96 v[2:4], v2, s[0:1]
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s5, v4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s3, v2
; GFX11-NEXT: v_readfirstlane_b32 s4, v3
; GFX11-NEXT: s_add_i32 s2, s5, s2
; GFX11-NEXT: s_add_i32 s0, s3, s0
; GFX11-NEXT: s_add_i32 s1, s4, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v3i32_gfx12:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_load_b96 v[2:4], v2, s[0:1]
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s5, v4
; GFX12-NEXT: v_readfirstlane_b32 s3, v2
; GFX12-NEXT: v_readfirstlane_b32 s4, v3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s5, s2
; GFX12-NEXT: s_add_co_i32 s0, s3, s0
; GFX12-NEXT: s_add_co_i32 s1, s4, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX12-NEXT: s_endpgm
; Both loads read the same inreg pointer %ptra; %ptrb is not used in this body.
; %a is a non-volatile <3 x i32> load with an explicit alignment of 2 bytes.
; %b is a volatile <3 x i32> load with no explicit alignment.
; The checks above expect one 96-bit VMEM load on every target. The scalar
; load differs per target: the hawaii and gfx1100 checks use a 128-bit scalar
; load (s_load_dwordx4 / s_load_b128), while the gfx1200 checks use s_load_b96.
; NOTE(review): presumably the under-aligned %a is the VMEM load and the
; volatile %b is the scalar load - confirm against the selection rules.
%a = load <3 x i32>, ptr addrspace(4) %ptra, align 2
%b = load volatile <3 x i32>, ptr addrspace(4) %ptra
%sum = add <3 x i32> %a, %b
; The element-wise sum is written through the non-inreg global pointer %out.
store <3 x i32> %sum, ptr addrspace(1) %out
ret void
}
; any target, 128-bit load, not align 4
define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P4_v4i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v3
; GFX7-NEXT: v_readfirstlane_b32 s8, v4
; GFX7-NEXT: v_readfirstlane_b32 s9, v5
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s4, s0, s4
; GFX7-NEXT: s_add_i32 s5, s1, s5
; GFX7-NEXT: s_add_i32 s6, s8, s6
; GFX7-NEXT: s_add_i32 s7, s9, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s7
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P4_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_load_b128 v[2:5], v2, s[0:1]
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s7, v5
; GFX11-NEXT: v_readfirstlane_b32 s4, v2
; GFX11-NEXT: v_readfirstlane_b32 s5, v3
; GFX11-NEXT: v_readfirstlane_b32 s6, v4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s3, s7, s3
; GFX11-NEXT: s_add_i32 s0, s4, s0
; GFX11-NEXT: s_add_i32 s1, s5, s1
; GFX11-NEXT: s_add_i32 s2, s6, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v4i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_load_b128 v[2:5], v2, s[0:1]
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s7, v5
; GFX12-NEXT: v_readfirstlane_b32 s4, v2
; GFX12-NEXT: v_readfirstlane_b32 s5, v3
; GFX12-NEXT: v_readfirstlane_b32 s6, v4
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s3, s7, s3
; GFX12-NEXT: s_add_co_i32 s0, s4, s0
; GFX12-NEXT: s_add_co_i32 s1, s5, s1
; GFX12-NEXT: s_add_co_i32 s2, s6, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: s_endpgm
; Both loads read the same inreg pointer %ptra; %ptrb is not used in this body.
; %a is a non-volatile <4 x i32> load with an explicit alignment of 2 bytes.
; %b is a volatile <4 x i32> load with no explicit alignment.
; The checks above expect one 128-bit VMEM load whose four dwords are each
; copied to an SGPR with v_readfirstlane_b32, plus one 128-bit scalar s_load,
; followed by one scalar add per element.
; NOTE(review): presumably the under-aligned %a is the VMEM load and the
; volatile %b is the scalar load - confirm against the selection rules.
%a = load <4 x i32>, ptr addrspace(4) %ptra, align 2
%b = load volatile <4 x i32>, ptr addrspace(4) %ptra
%sum = add <4 x i32> %a, %b
; The element-wise sum is written through the non-inreg global pointer %out.
store <4 x i32> %sum, ptr addrspace(1) %out
ret void
}
; any target, 256-bit load, not align 4
define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P4_v8i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16
; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_readfirstlane_b32 s12, v2
; GFX7-NEXT: v_readfirstlane_b32 s13, v3
; GFX7-NEXT: v_readfirstlane_b32 s14, v4
; GFX7-NEXT: v_readfirstlane_b32 s15, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s16, v6
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s4, s12, s4
; GFX7-NEXT: v_readfirstlane_b32 s17, v7
; GFX7-NEXT: v_readfirstlane_b32 s18, v8
; GFX7-NEXT: v_readfirstlane_b32 s19, v9
; GFX7-NEXT: s_add_i32 s5, s13, s5
; GFX7-NEXT: s_add_i32 s6, s14, s6
; GFX7-NEXT: s_add_i32 s7, s15, s7
; GFX7-NEXT: s_add_i32 s8, s16, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_add_i32 s9, s17, s9
; GFX7-NEXT: s_add_i32 s10, s18, s10
; GFX7-NEXT: s_add_i32 s11, s19, s11
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s7
; GFX7-NEXT: v_mov_b32_e32 v6, s8
; GFX7-NEXT: v_mov_b32_e32 v7, s9
; GFX7-NEXT: v_mov_b32_e32 v8, s10
; GFX7-NEXT: v_mov_b32_e32 v9, s11
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P4_v8i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1]
; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_readfirstlane_b32 s11, v5
; GFX11-NEXT: v_readfirstlane_b32 s8, v2
; GFX11-NEXT: v_readfirstlane_b32 s9, v3
; GFX11-NEXT: v_readfirstlane_b32 s10, v4
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s15, v9
; GFX11-NEXT: v_readfirstlane_b32 s12, v6
; GFX11-NEXT: v_readfirstlane_b32 s13, v7
; GFX11-NEXT: v_readfirstlane_b32 s14, v8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s3, s11, s3
; GFX11-NEXT: s_add_i32 s0, s8, s0
; GFX11-NEXT: s_add_i32 s1, s9, s1
; GFX11-NEXT: s_add_i32 s2, s10, s2
; GFX11-NEXT: s_add_i32 s7, s15, s7
; GFX11-NEXT: s_add_i32 s4, s12, s4
; GFX11-NEXT: s_add_i32 s5, s13, s5
; GFX11-NEXT: s_add_i32 s6, s14, s6
; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v8i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v6, 0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1]
; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16
; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
; GFX12-NEXT: s_wait_loadcnt 0x1
; GFX12-NEXT: v_readfirstlane_b32 s11, v5
; GFX12-NEXT: v_readfirstlane_b32 s8, v2
; GFX12-NEXT: v_readfirstlane_b32 s9, v3
; GFX12-NEXT: v_readfirstlane_b32 s10, v4
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s15, v9
; GFX12-NEXT: v_readfirstlane_b32 s12, v6
; GFX12-NEXT: v_readfirstlane_b32 s13, v7
; GFX12-NEXT: v_readfirstlane_b32 s14, v8
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s3, s11, s3
; GFX12-NEXT: s_add_co_i32 s0, s8, s0
; GFX12-NEXT: s_add_co_i32 s1, s9, s1
; GFX12-NEXT: s_add_co_i32 s2, s10, s2
; GFX12-NEXT: s_add_co_i32 s7, s15, s7
; GFX12-NEXT: s_add_co_i32 s4, s12, s4
; GFX12-NEXT: s_add_co_i32 s5, s13, s5
; GFX12-NEXT: s_add_co_i32 s6, s14, s6
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6
; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT: s_endpgm
; Both loads read the same inreg pointer %ptra; %ptrb is not used in this body.
; %a is a non-volatile <8 x i32> load with an explicit alignment of 2 bytes.
; %b is a volatile <8 x i32> load with no explicit alignment.
; The checks above expect the VMEM side to be split into two 128-bit loads
; (offsets 0 and 16) while the scalar side stays a single 256-bit s_load;
; the result is likewise written with two 128-bit stores.
; NOTE(review): presumably the under-aligned %a is the VMEM load and the
; volatile %b is the scalar load - confirm against the selection rules.
%a = load <8 x i32>, ptr addrspace(4) %ptra, align 2
%b = load volatile <8 x i32>, ptr addrspace(4) %ptra
%sum = add <8 x i32> %a, %b
; The element-wise sum is written through the non-inreg global pointer %out.
store <8 x i32> %sum, ptr addrspace(1) %out
ret void
}
; any target, 512-bit load, not align 4
define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) {
; GFX7-LABEL: load_uniform_P4_v16i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0
; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16
; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32
; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48
; GFX7-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_readfirstlane_b32 s20, v2
; GFX7-NEXT: v_readfirstlane_b32 s21, v3
; GFX7-NEXT: v_readfirstlane_b32 s22, v4
; GFX7-NEXT: v_readfirstlane_b32 s23, v5
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_readfirstlane_b32 s24, v6
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_readfirstlane_b32 s28, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s33, v14
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s4, s20, s4
; GFX7-NEXT: v_readfirstlane_b32 s25, v7
; GFX7-NEXT: v_readfirstlane_b32 s26, v8
; GFX7-NEXT: v_readfirstlane_b32 s27, v9
; GFX7-NEXT: v_readfirstlane_b32 s29, v11
; GFX7-NEXT: v_readfirstlane_b32 s30, v12
; GFX7-NEXT: v_readfirstlane_b32 s31, v13
; GFX7-NEXT: v_readfirstlane_b32 s34, v15
; GFX7-NEXT: v_readfirstlane_b32 s35, v16
; GFX7-NEXT: v_readfirstlane_b32 s36, v17
; GFX7-NEXT: s_add_i32 s5, s21, s5
; GFX7-NEXT: s_add_i32 s6, s22, s6
; GFX7-NEXT: s_add_i32 s7, s23, s7
; GFX7-NEXT: s_add_i32 s8, s24, s8
; GFX7-NEXT: s_add_i32 s12, s28, s12
; GFX7-NEXT: s_add_i32 s16, s33, s16
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_add_i32 s9, s25, s9
; GFX7-NEXT: s_add_i32 s10, s26, s10
; GFX7-NEXT: s_add_i32 s11, s27, s11
; GFX7-NEXT: s_add_i32 s13, s29, s13
; GFX7-NEXT: s_add_i32 s14, s30, s14
; GFX7-NEXT: s_add_i32 s15, s31, s15
; GFX7-NEXT: s_add_i32 s17, s34, s17
; GFX7-NEXT: s_add_i32 s18, s35, s18
; GFX7-NEXT: s_add_i32 s19, s36, s19
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s7
; GFX7-NEXT: v_mov_b32_e32 v6, s8
; GFX7-NEXT: v_mov_b32_e32 v10, s12
; GFX7-NEXT: v_mov_b32_e32 v14, s16
; GFX7-NEXT: v_mov_b32_e32 v7, s9
; GFX7-NEXT: v_mov_b32_e32 v8, s10
; GFX7-NEXT: v_mov_b32_e32 v9, s11
; GFX7-NEXT: v_mov_b32_e32 v11, s13
; GFX7-NEXT: v_mov_b32_e32 v12, s14
; GFX7-NEXT: v_mov_b32_e32 v13, s15
; GFX7-NEXT: v_mov_b32_e32 v15, s17
; GFX7-NEXT: v_mov_b32_e32 v16, s18
; GFX7-NEXT: v_mov_b32_e32 v17, s19
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[0:3], 0 addr64 offset:48
; GFX7-NEXT: s_endpgm
;
; GFX11-LABEL: load_uniform_P4_v16i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v14, 0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b128 v[2:5], v14, s[0:1]
; GFX11-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16
; GFX11-NEXT: global_load_b128 v[10:13], v14, s[0:1] offset:32
; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48
; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_readfirstlane_b32 s19, v5
; GFX11-NEXT: v_readfirstlane_b32 s16, v2
; GFX11-NEXT: v_readfirstlane_b32 s17, v3
; GFX11-NEXT: v_readfirstlane_b32 s18, v4
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_readfirstlane_b32 s23, v9
; GFX11-NEXT: v_readfirstlane_b32 s20, v6
; GFX11-NEXT: v_readfirstlane_b32 s21, v7
; GFX11-NEXT: v_readfirstlane_b32 s22, v8
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_readfirstlane_b32 s27, v13
; GFX11-NEXT: v_readfirstlane_b32 s24, v10
; GFX11-NEXT: v_readfirstlane_b32 s25, v11
; GFX11-NEXT: v_readfirstlane_b32 s26, v12
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s31, v17
; GFX11-NEXT: v_readfirstlane_b32 s28, v14
; GFX11-NEXT: v_readfirstlane_b32 s29, v15
; GFX11-NEXT: v_readfirstlane_b32 s30, v16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s3, s19, s3
; GFX11-NEXT: s_add_i32 s0, s16, s0
; GFX11-NEXT: s_add_i32 s1, s17, s1
; GFX11-NEXT: s_add_i32 s2, s18, s2
; GFX11-NEXT: s_add_i32 s7, s23, s7
; GFX11-NEXT: s_add_i32 s4, s20, s4
; GFX11-NEXT: s_add_i32 s5, s21, s5
; GFX11-NEXT: s_add_i32 s6, s22, s6
; GFX11-NEXT: s_add_i32 s11, s27, s11
; GFX11-NEXT: v_mov_b32_e32 v5, s3
; GFX11-NEXT: s_add_i32 s8, s24, s8
; GFX11-NEXT: s_add_i32 s9, s25, s9
; GFX11-NEXT: s_add_i32 s10, s26, s10
; GFX11-NEXT: s_add_i32 s15, s31, s15
; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7
; GFX11-NEXT: s_add_i32 s12, s28, s12
; GFX11-NEXT: s_add_i32 s13, s29, s13
; GFX11-NEXT: s_add_i32 s14, s30, s14
; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5
; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11
; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9
; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15
; GFX11-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13
; GFX11-NEXT: v_mov_b32_e32 v14, s12
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: load_uniform_P4_v16i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v14, 0
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_load_b128 v[2:5], v14, s[0:1]
; GFX12-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16
; GFX12-NEXT: global_load_b128 v[10:13], v14, s[0:1] offset:32
; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48
; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
; GFX12-NEXT: s_wait_loadcnt 0x3
; GFX12-NEXT: v_readfirstlane_b32 s19, v5
; GFX12-NEXT: v_readfirstlane_b32 s16, v2
; GFX12-NEXT: v_readfirstlane_b32 s17, v3
; GFX12-NEXT: v_readfirstlane_b32 s18, v4
; GFX12-NEXT: s_wait_loadcnt 0x2
; GFX12-NEXT: v_readfirstlane_b32 s23, v9
; GFX12-NEXT: v_readfirstlane_b32 s20, v6
; GFX12-NEXT: v_readfirstlane_b32 s21, v7
; GFX12-NEXT: v_readfirstlane_b32 s22, v8
; GFX12-NEXT: s_wait_loadcnt 0x1
; GFX12-NEXT: v_readfirstlane_b32 s27, v13
; GFX12-NEXT: v_readfirstlane_b32 s24, v10
; GFX12-NEXT: v_readfirstlane_b32 s25, v11
; GFX12-NEXT: v_readfirstlane_b32 s26, v12
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s31, v17
; GFX12-NEXT: v_readfirstlane_b32 s28, v14
; GFX12-NEXT: v_readfirstlane_b32 s29, v15
; GFX12-NEXT: v_readfirstlane_b32 s30, v16
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s3, s19, s3
; GFX12-NEXT: s_add_co_i32 s0, s16, s0
; GFX12-NEXT: s_add_co_i32 s1, s17, s1
; GFX12-NEXT: s_add_co_i32 s2, s18, s2
; GFX12-NEXT: s_add_co_i32 s7, s23, s7
; GFX12-NEXT: s_add_co_i32 s4, s20, s4
; GFX12-NEXT: s_add_co_i32 s5, s21, s5
; GFX12-NEXT: s_add_co_i32 s6, s22, s6
; GFX12-NEXT: s_add_co_i32 s11, s27, s11
; GFX12-NEXT: v_mov_b32_e32 v5, s3
; GFX12-NEXT: s_add_co_i32 s8, s24, s8
; GFX12-NEXT: s_add_co_i32 s9, s25, s9
; GFX12-NEXT: s_add_co_i32 s10, s26, s10
; GFX12-NEXT: s_add_co_i32 s15, s31, s15
; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7
; GFX12-NEXT: s_add_co_i32 s12, s28, s12
; GFX12-NEXT: s_add_co_i32 s13, s29, s13
; GFX12-NEXT: s_add_co_i32 s14, s30, s14
; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5
; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11
; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9
; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15
; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13
; GFX12-NEXT: v_mov_b32_e32 v14, s12
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
; GFX12-NEXT: s_endpgm
; Both loads read the same inreg pointer %ptra; %ptrb is not used in this body.
; %a is a non-volatile <16 x i32> load with an explicit alignment of 2 bytes.
; %b is a volatile <16 x i32> load with no explicit alignment.
; The checks above expect the VMEM side to be split into four 128-bit loads
; (offsets 0, 16, 32 and 48) while the scalar side stays a single 512-bit
; s_load; the result is likewise written with four 128-bit stores.
; NOTE(review): presumably the under-aligned %a is the VMEM load and the
; volatile %b is the scalar load - confirm against the selection rules.
%a = load <16 x i32>, ptr addrspace(4) %ptra, align 2
%b = load volatile <16 x i32>, ptr addrspace(4) %ptra
%sum = add <16 x i32> %a, %b
; The element-wise sum is written through the non-inreg global pointer %out.
store <16 x i32> %sum, ptr addrspace(1) %out
ret void
}