| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX7 %s |
| ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefixes=GFX11,GFX11-True16 %s |
| ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-NoTrue16 %s |
| ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-True16 %s |
| ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-NoTrue16 %s |
| |
| ; global address space, addrspace(1) |
| |
| ; GFX12 with true16, 16-bit load: not naturally aligned (%a, align 1) or not uniform MMO (presumably %b, the volatile load from %ptrb) |
| define amdgpu_ps void @load_uniform_P1_i16_b16_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P1_i16_b16_gfx12: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s4, s2 |
| ; GFX7-NEXT: s_mov_b32 s5, s3 |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] |
| ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-True16-LABEL: load_uniform_P1_i16_b16_gfx12: |
| ; GFX11-True16: ; %bb.0: |
| ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-True16-NEXT: s_clause 0x1 |
| ; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] |
| ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] glc dlc |
| ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-True16-NEXT: s_endpgm |
| ; |
| ; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx12: |
| ; GFX11-NoTrue16: ; %bb.0: |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NoTrue16-NEXT: s_clause 0x1 |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc |
| ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NoTrue16-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P1_i16_b16_gfx12: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-True16-NEXT: s_clause 0x1 |
| ; GFX12-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] |
| ; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] scope:SCOPE_SYS |
| ; GFX12-True16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-True16-NEXT: s_add_co_i32 s0, s0, s1 |
| ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx12: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NoTrue16-NEXT: s_clause 0x1 |
| ; GFX12-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] |
| ; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s0, s1 |
| ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| %a = load i16, ptr addrspace(1) %ptra, align 1 |
| %b = load volatile i16, ptr addrspace(1) %ptrb |
| %sum = add i16 %a, %b |
| store i16 %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; GFX11 with true16, 16-bit load: not align 4 (%a, no explicit align) or not uniform MMO (presumably %b, the volatile align-4 load) |
| define amdgpu_ps void @load_uniform_P1_i16_b16_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P1_i16_b16_gfx11: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-True16-LABEL: load_uniform_P1_i16_b16_gfx11: |
| ; GFX11-True16: ; %bb.0: |
| ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-True16-NEXT: s_clause 0x1 |
| ; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] |
| ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc |
| ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-True16-NEXT: s_endpgm |
| ; |
| ; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx11: |
| ; GFX11-NoTrue16: ; %bb.0: |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NoTrue16-NEXT: s_clause 0x1 |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc |
| ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NoTrue16-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P1_i16_b16_gfx11: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS |
| ; GFX12-True16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0 |
| ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_b16_gfx11: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0 |
| ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| %a = load i16, ptr addrspace(1) %ptra |
| %b = load volatile i16, ptr addrspace(1) %ptra, align 4 |
| %sum = add i16 %a, %b |
| store i16 %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; GFX12 without true16, 16-bit any-extending load: not naturally aligned (%a, align 1) or not uniform MMO (presumably %b, the volatile load) |
| define amdgpu_ps void @load_uniform_P1_i16_anyextending_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P1_i16_anyextending_gfx12: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-True16-LABEL: load_uniform_P1_i16_anyextending_gfx12: |
| ; GFX11-True16: ; %bb.0: |
| ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-True16-NEXT: s_clause 0x1 |
| ; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] |
| ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc |
| ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-True16-NEXT: s_endpgm |
| ; |
| ; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx12: |
| ; GFX11-NoTrue16: ; %bb.0: |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NoTrue16-NEXT: s_clause 0x1 |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc |
| ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NoTrue16-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P1_i16_anyextending_gfx12: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-True16-NEXT: s_clause 0x1 |
| ; GFX12-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] |
| ; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS |
| ; GFX12-True16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-True16-NEXT: s_add_co_i32 s0, s0, s1 |
| ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx12: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NoTrue16-NEXT: s_clause 0x1 |
| ; GFX12-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] |
| ; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s0, s1 |
| ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| %a = load i16, ptr addrspace(1) %ptra, align 1 |
| %b = load volatile i16, ptr addrspace(1) %ptra |
| %sum = add i16 %a, %b |
| store i16 %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; GFX11 (or older) without true16, s16 any-extending load: not align 4 (%a, no explicit align) or not uniform MMO (presumably %b, the volatile align-4 load) |
| define amdgpu_ps void @load_uniform_P1_i16_anyextending_gfx11(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P1_i16_anyextending_gfx11: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-True16-LABEL: load_uniform_P1_i16_anyextending_gfx11: |
| ; GFX11-True16: ; %bb.0: |
| ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-True16-NEXT: s_clause 0x1 |
| ; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] |
| ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc |
| ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-True16-NEXT: s_endpgm |
| ; |
| ; GFX11-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx11: |
| ; GFX11-NoTrue16: ; %bb.0: |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NoTrue16-NEXT: s_clause 0x1 |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc |
| ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NoTrue16-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P1_i16_anyextending_gfx11: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] scope:SCOPE_SYS |
| ; GFX12-True16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0 |
| ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P1_i16_anyextending_gfx11: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0 |
| ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| %a = load i16, ptr addrspace(1) %ptra |
| %b = load volatile i16, ptr addrspace(1) %ptra, align 4 |
| %sum = add i16 %a, %b |
| store i16 %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Any target, 32-bit load: not align 4 (%a, align 2) or not uniform MMO (presumably %b, the volatile load) |
| define amdgpu_ps void @load_uniform_P1_i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P1_i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P1_i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b32 v3, v2, s[0:1] |
| ; GFX11-NEXT: global_load_b32 v2, v2, s[0:1] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NEXT: global_store_b32 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: global_load_b32 v3, v2, s[0:1] |
| ; GFX12-NEXT: global_load_b32 v2, v2, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX12-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-NEXT: s_add_co_i32 s0, s0, s1 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b32 v[0:1], v2, off |
| ; GFX12-NEXT: s_endpgm |
| %a = load i32, ptr addrspace(1) %ptra, align 2 |
| %b = load volatile i32, ptr addrspace(1) %ptra |
| %sum = add i32 %a, %b |
| store i32 %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Any target, 64-bit load: not align 4 (%a, align 2) or not uniform MMO (presumably %b, the volatile load) |
| define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P1_v2i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: v_readfirstlane_b32 s5, v5 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s4, v4 |
| ; GFX7-NEXT: s_add_i32 s1, s1, s5 |
| ; GFX7-NEXT: s_add_i32 s0, s0, s4 |
| ; GFX7-NEXT: v_mov_b32_e32 v3, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P1_v2i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] |
| ; GFX11-NEXT: global_load_b64 v[4:5], v4, s[0:1] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX11-NEXT: v_readfirstlane_b32 s3, v5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX11-NEXT: s_add_i32 s1, s1, s3 |
| ; GFX11-NEXT: s_add_i32 s0, s0, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v2i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: global_load_b64 v[2:3], v4, s[0:1] |
| ; GFX12-NEXT: global_load_b64 v[4:5], v4, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX12-NEXT: v_readfirstlane_b32 s3, v5 |
| ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s3 |
| ; GFX12-NEXT: s_add_co_i32 s0, s0, s2 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off |
| ; GFX12-NEXT: s_endpgm |
| %a = load <2 x i32>, ptr addrspace(1) %ptra, align 2 |
| %b = load volatile <2 x i32>, ptr addrspace(1) %ptra |
| %sum = add <2 x i32> %a, %b |
| store <2 x i32> %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Any target, 96-bit load: not align 4 (%a, align 2) or not uniform MMO (presumably %b, the volatile load). NOTE(review): the function name ends in _gfx12 although this says "any target" - confirm which is intended. |
| define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P1_v3i32_gfx12: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dwordx3 v[2:4], off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_dwordx3 v[5:7], off, s[0:3], 0 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s4, v5 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s5, v6 |
| ; GFX7-NEXT: v_readfirstlane_b32 s7, v7 |
| ; GFX7-NEXT: s_add_i32 s4, s0, s4 |
| ; GFX7-NEXT: s_add_i32 s5, s1, s5 |
| ; GFX7-NEXT: s_add_i32 s6, s6, s7 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX7-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX7-NEXT: v_mov_b32_e32 v4, s6 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_dwordx3 v[2:4], v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P1_v3i32_gfx12: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v5, 0 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b96 v[2:4], v5, s[0:1] |
| ; GFX11-NEXT: global_load_b96 v[5:7], v5, s[0:1] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX11-NEXT: v_readfirstlane_b32 s3, v5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 |
| ; GFX11-NEXT: s_add_i32 s2, s2, s5 |
| ; GFX11-NEXT: s_add_i32 s0, s0, s3 |
| ; GFX11-NEXT: s_add_i32 s1, s1, s4 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v3i32_gfx12: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v5, 0 |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: global_load_b96 v[2:4], v5, s[0:1] |
| ; GFX12-NEXT: global_load_b96 v[5:7], v5, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX12-NEXT: v_readfirstlane_b32 s5, v7 |
| ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX12-NEXT: v_readfirstlane_b32 s3, v5 |
| ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 |
| ; GFX12-NEXT: s_add_co_i32 s2, s2, s5 |
| ; GFX12-NEXT: s_add_co_i32 s0, s0, s3 |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s4 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX12-NEXT: s_endpgm |
| %a = load <3 x i32>, ptr addrspace(1) %ptra, align 2 |
| %b = load volatile <3 x i32>, ptr addrspace(1) %ptra |
| %sum = add <3 x i32> %a, %b |
| store <3 x i32> %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Any target, 128-bit load: not align 4 (%a, align 2) or not uniform MMO (presumably %b, the volatile load) |
| define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P1_v4i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s4, v6 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s7, v5 |
| ; GFX7-NEXT: v_readfirstlane_b32 s5, v7 |
| ; GFX7-NEXT: v_readfirstlane_b32 s8, v8 |
| ; GFX7-NEXT: v_readfirstlane_b32 s9, v9 |
| ; GFX7-NEXT: s_add_i32 s4, s0, s4 |
| ; GFX7-NEXT: s_add_i32 s5, s1, s5 |
| ; GFX7-NEXT: s_add_i32 s6, s6, s8 |
| ; GFX7-NEXT: s_add_i32 s7, s7, s9 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX7-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX7-NEXT: v_mov_b32_e32 v4, s6 |
| ; GFX7-NEXT: v_mov_b32_e32 v5, s7 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P1_v4i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v6, 0 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1] |
| ; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s3, v5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s7, v9 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 |
| ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 |
| ; GFX11-NEXT: v_readfirstlane_b32 s6, v8 |
| ; GFX11-NEXT: s_add_i32 s3, s3, s7 |
| ; GFX11-NEXT: s_add_i32 s0, s0, s4 |
| ; GFX11-NEXT: s_add_i32 s1, s1, s5 |
| ; GFX11-NEXT: s_add_i32 s2, s2, s6 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v4i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v6, 0 |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1] |
| ; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s3, v5 |
| ; GFX12-NEXT: v_readfirstlane_b32 s7, v9 |
| ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 |
| ; GFX12-NEXT: v_readfirstlane_b32 s5, v7 |
| ; GFX12-NEXT: v_readfirstlane_b32 s6, v8 |
| ; GFX12-NEXT: s_add_co_i32 s3, s3, s7 |
| ; GFX12-NEXT: s_add_co_i32 s0, s0, s4 |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s5 |
| ; GFX12-NEXT: s_add_co_i32 s2, s2, s6 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: s_endpgm |
| %a = load <4 x i32>, ptr addrspace(1) %ptra, align 2 |
| %b = load volatile <4 x i32>, ptr addrspace(1) %ptra |
| %sum = add <4 x i32> %a, %b |
| store <4 x i32> %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; any target, 256-bit load, not align 4 or not uniform mmo |
| define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P1_v8i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:16 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s5, v3 |
| ; GFX7-NEXT: v_readfirstlane_b32 s12, v10 |
| ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s7, v5 |
| ; GFX7-NEXT: v_readfirstlane_b32 s8, v6 |
| ; GFX7-NEXT: v_readfirstlane_b32 s13, v11 |
| ; GFX7-NEXT: v_readfirstlane_b32 s14, v12 |
| ; GFX7-NEXT: v_readfirstlane_b32 s15, v13 |
| ; GFX7-NEXT: v_readfirstlane_b32 s16, v14 |
| ; GFX7-NEXT: s_add_i32 s4, s4, s12 |
| ; GFX7-NEXT: v_readfirstlane_b32 s9, v7 |
| ; GFX7-NEXT: v_readfirstlane_b32 s10, v8 |
| ; GFX7-NEXT: v_readfirstlane_b32 s11, v9 |
| ; GFX7-NEXT: v_readfirstlane_b32 s17, v15 |
| ; GFX7-NEXT: v_readfirstlane_b32 s18, v16 |
| ; GFX7-NEXT: v_readfirstlane_b32 s19, v17 |
| ; GFX7-NEXT: s_add_i32 s5, s5, s13 |
| ; GFX7-NEXT: s_add_i32 s6, s6, s14 |
| ; GFX7-NEXT: s_add_i32 s7, s7, s15 |
| ; GFX7-NEXT: s_add_i32 s8, s8, s16 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX7-NEXT: s_add_i32 s9, s9, s17 |
| ; GFX7-NEXT: s_add_i32 s10, s10, s18 |
| ; GFX7-NEXT: s_add_i32 s11, s11, s19 |
| ; GFX7-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX7-NEXT: v_mov_b32_e32 v4, s6 |
| ; GFX7-NEXT: v_mov_b32_e32 v5, s7 |
| ; GFX7-NEXT: v_mov_b32_e32 v6, s8 |
| ; GFX7-NEXT: v_mov_b32_e32 v7, s9 |
| ; GFX7-NEXT: v_mov_b32_e32 v8, s10 |
| ; GFX7-NEXT: v_mov_b32_e32 v9, s11 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P1_v8i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v14, 0 |
| ; GFX11-NEXT: s_clause 0x2 |
| ; GFX11-NEXT: global_load_b128 v[2:5], v14, s[0:1] |
| ; GFX11-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16 |
| ; GFX11-NEXT: global_load_b128 v[10:13], v14, s[0:1] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s3, v5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s11, v13 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX11-NEXT: v_readfirstlane_b32 s7, v9 |
| ; GFX11-NEXT: v_readfirstlane_b32 s8, v10 |
| ; GFX11-NEXT: v_readfirstlane_b32 s9, v11 |
| ; GFX11-NEXT: v_readfirstlane_b32 s10, v12 |
| ; GFX11-NEXT: v_readfirstlane_b32 s15, v17 |
| ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 |
| ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 |
| ; GFX11-NEXT: v_readfirstlane_b32 s6, v8 |
| ; GFX11-NEXT: v_readfirstlane_b32 s12, v14 |
| ; GFX11-NEXT: v_readfirstlane_b32 s13, v15 |
| ; GFX11-NEXT: v_readfirstlane_b32 s14, v16 |
| ; GFX11-NEXT: s_add_i32 s3, s3, s11 |
| ; GFX11-NEXT: s_add_i32 s0, s0, s8 |
| ; GFX11-NEXT: s_add_i32 s1, s1, s9 |
| ; GFX11-NEXT: s_add_i32 s2, s2, s10 |
| ; GFX11-NEXT: s_add_i32 s7, s7, s15 |
| ; GFX11-NEXT: s_add_i32 s4, s4, s12 |
| ; GFX11-NEXT: s_add_i32 s5, s5, s13 |
| ; GFX11-NEXT: s_add_i32 s6, s6, s14 |
| ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v8i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v14, 0 |
| ; GFX12-NEXT: s_clause 0x2 |
| ; GFX12-NEXT: global_load_b128 v[2:5], v14, s[0:1] |
| ; GFX12-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16 |
| ; GFX12-NEXT: global_load_b128 v[10:13], v14, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s3, v5 |
| ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s11, v13 |
| ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX12-NEXT: v_readfirstlane_b32 s7, v9 |
| ; GFX12-NEXT: v_readfirstlane_b32 s8, v10 |
| ; GFX12-NEXT: v_readfirstlane_b32 s9, v11 |
| ; GFX12-NEXT: v_readfirstlane_b32 s10, v12 |
| ; GFX12-NEXT: v_readfirstlane_b32 s15, v17 |
| ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 |
| ; GFX12-NEXT: v_readfirstlane_b32 s5, v7 |
| ; GFX12-NEXT: v_readfirstlane_b32 s6, v8 |
| ; GFX12-NEXT: v_readfirstlane_b32 s12, v14 |
| ; GFX12-NEXT: v_readfirstlane_b32 s13, v15 |
| ; GFX12-NEXT: v_readfirstlane_b32 s14, v16 |
| ; GFX12-NEXT: s_add_co_i32 s3, s3, s11 |
| ; GFX12-NEXT: s_add_co_i32 s0, s0, s8 |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s9 |
| ; GFX12-NEXT: s_add_co_i32 s2, s2, s10 |
| ; GFX12-NEXT: s_add_co_i32 s7, s7, s15 |
| ; GFX12-NEXT: s_add_co_i32 s4, s4, s12 |
| ; GFX12-NEXT: s_add_co_i32 s5, s5, s13 |
| ; GFX12-NEXT: s_add_co_i32 s6, s6, s14 |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX12-NEXT: s_endpgm |
| %a = load <8 x i32>, ptr addrspace(1) %ptra, align 2 |
| %b = load volatile <8 x i32>, ptr addrspace(1) %ptra |
| %sum = add <8 x i32> %a, %b |
| store <8 x i32> %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; any target, 512-bit load, not align 4 or not uniform mmo |
| ; Adds a <16 x i32> load from %ptra that is only 2-byte aligned to a volatile |
| ; load of the same type from the same pointer, then stores the sum to %out. |
| ; %ptrb is not used by this test. |
| ; The checks expect every 512-bit load to be emitted as four 128-bit VMEM loads |
| ; whose results are copied to SGPRs with v_readfirstlane_b32 before scalar adds. |
| define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr addrspace(1) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P1_v16i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 |
| ; GFX7-NEXT: s_waitcnt vmcnt(3) |
| ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s5, v3 |
| ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s7, v5 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s8, v6 |
| ; GFX7-NEXT: v_readfirstlane_b32 s9, v7 |
| ; GFX7-NEXT: v_readfirstlane_b32 s10, v8 |
| ; GFX7-NEXT: v_readfirstlane_b32 s11, v9 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s12, v10 |
| ; GFX7-NEXT: v_readfirstlane_b32 s13, v11 |
| ; GFX7-NEXT: v_readfirstlane_b32 s14, v12 |
| ; GFX7-NEXT: v_readfirstlane_b32 s15, v13 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s16, v14 |
| ; GFX7-NEXT: v_readfirstlane_b32 s17, v15 |
| ; GFX7-NEXT: v_readfirstlane_b32 s18, v16 |
| ; GFX7-NEXT: v_readfirstlane_b32 s19, v17 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s20, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s21, v3 |
| ; GFX7-NEXT: v_readfirstlane_b32 s22, v4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s23, v5 |
| ; GFX7-NEXT: s_add_i32 s4, s4, s20 |
| ; GFX7-NEXT: v_readfirstlane_b32 s24, v6 |
| ; GFX7-NEXT: v_readfirstlane_b32 s25, v7 |
| ; GFX7-NEXT: v_readfirstlane_b32 s26, v8 |
| ; GFX7-NEXT: v_readfirstlane_b32 s27, v9 |
| ; GFX7-NEXT: s_add_i32 s5, s5, s21 |
| ; GFX7-NEXT: v_readfirstlane_b32 s28, v10 |
| ; GFX7-NEXT: v_readfirstlane_b32 s29, v11 |
| ; GFX7-NEXT: v_readfirstlane_b32 s30, v12 |
| ; GFX7-NEXT: v_readfirstlane_b32 s31, v13 |
| ; GFX7-NEXT: s_add_i32 s6, s6, s22 |
| ; GFX7-NEXT: v_readfirstlane_b32 s33, v14 |
| ; GFX7-NEXT: v_readfirstlane_b32 s34, v15 |
| ; GFX7-NEXT: v_readfirstlane_b32 s35, v16 |
| ; GFX7-NEXT: v_readfirstlane_b32 s36, v17 |
| ; GFX7-NEXT: s_add_i32 s7, s7, s23 |
| ; GFX7-NEXT: s_add_i32 s8, s8, s24 |
| ; GFX7-NEXT: s_add_i32 s12, s12, s28 |
| ; GFX7-NEXT: s_add_i32 s16, s16, s33 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX7-NEXT: s_add_i32 s9, s9, s25 |
| ; GFX7-NEXT: s_add_i32 s10, s10, s26 |
| ; GFX7-NEXT: s_add_i32 s11, s11, s27 |
| ; GFX7-NEXT: s_add_i32 s13, s13, s29 |
| ; GFX7-NEXT: s_add_i32 s14, s14, s30 |
| ; GFX7-NEXT: s_add_i32 s15, s15, s31 |
| ; GFX7-NEXT: s_add_i32 s17, s17, s34 |
| ; GFX7-NEXT: s_add_i32 s18, s18, s35 |
| ; GFX7-NEXT: s_add_i32 s19, s19, s36 |
| ; GFX7-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX7-NEXT: v_mov_b32_e32 v4, s6 |
| ; GFX7-NEXT: v_mov_b32_e32 v5, s7 |
| ; GFX7-NEXT: v_mov_b32_e32 v6, s8 |
| ; GFX7-NEXT: v_mov_b32_e32 v10, s12 |
| ; GFX7-NEXT: v_mov_b32_e32 v14, s16 |
| ; GFX7-NEXT: v_mov_b32_e32 v7, s9 |
| ; GFX7-NEXT: v_mov_b32_e32 v8, s10 |
| ; GFX7-NEXT: v_mov_b32_e32 v9, s11 |
| ; GFX7-NEXT: v_mov_b32_e32 v11, s13 |
| ; GFX7-NEXT: v_mov_b32_e32 v12, s14 |
| ; GFX7-NEXT: v_mov_b32_e32 v13, s15 |
| ; GFX7-NEXT: v_mov_b32_e32 v15, s17 |
| ; GFX7-NEXT: v_mov_b32_e32 v16, s18 |
| ; GFX7-NEXT: v_mov_b32_e32 v17, s19 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[0:3], 0 addr64 offset:48 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P1_v16i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v30, 0 |
| ; GFX11-NEXT: s_clause 0x4 |
| ; GFX11-NEXT: global_load_b128 v[2:5], v30, s[0:1] |
| ; GFX11-NEXT: global_load_b128 v[6:9], v30, s[0:1] offset:16 |
| ; GFX11-NEXT: global_load_b128 v[10:13], v30, s[0:1] offset:32 |
| ; GFX11-NEXT: global_load_b128 v[14:17], v30, s[0:1] offset:48 |
| ; GFX11-NEXT: global_load_b128 v[18:21], v30, s[0:1] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b128 v[22:25], v30, s[0:1] offset:16 glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b128 v[26:29], v30, s[0:1] offset:32 glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s3, v5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX11-NEXT: v_readfirstlane_b32 s19, v21 |
| ; GFX11-NEXT: v_readfirstlane_b32 s7, v9 |
| ; GFX11-NEXT: v_readfirstlane_b32 s16, v18 |
| ; GFX11-NEXT: v_readfirstlane_b32 s17, v19 |
| ; GFX11-NEXT: v_readfirstlane_b32 s18, v20 |
| ; GFX11-NEXT: v_readfirstlane_b32 s23, v25 |
| ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 |
| ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 |
| ; GFX11-NEXT: v_readfirstlane_b32 s6, v8 |
| ; GFX11-NEXT: v_readfirstlane_b32 s11, v13 |
| ; GFX11-NEXT: v_readfirstlane_b32 s20, v22 |
| ; GFX11-NEXT: v_readfirstlane_b32 s21, v23 |
| ; GFX11-NEXT: v_readfirstlane_b32 s22, v24 |
| ; GFX11-NEXT: v_readfirstlane_b32 s27, v29 |
| ; GFX11-NEXT: v_readfirstlane_b32 s8, v10 |
| ; GFX11-NEXT: v_readfirstlane_b32 s9, v11 |
| ; GFX11-NEXT: v_readfirstlane_b32 s10, v12 |
| ; GFX11-NEXT: v_readfirstlane_b32 s15, v17 |
| ; GFX11-NEXT: v_readfirstlane_b32 s24, v26 |
| ; GFX11-NEXT: v_readfirstlane_b32 s25, v27 |
| ; GFX11-NEXT: v_readfirstlane_b32 s26, v28 |
| ; GFX11-NEXT: v_readfirstlane_b32 s31, v33 |
| ; GFX11-NEXT: v_readfirstlane_b32 s12, v14 |
| ; GFX11-NEXT: v_readfirstlane_b32 s13, v15 |
| ; GFX11-NEXT: v_readfirstlane_b32 s14, v16 |
| ; GFX11-NEXT: v_readfirstlane_b32 s28, v30 |
| ; GFX11-NEXT: v_readfirstlane_b32 s29, v31 |
| ; GFX11-NEXT: v_readfirstlane_b32 s30, v32 |
| ; GFX11-NEXT: s_add_i32 s3, s3, s19 |
| ; GFX11-NEXT: s_add_i32 s0, s0, s16 |
| ; GFX11-NEXT: s_add_i32 s1, s1, s17 |
| ; GFX11-NEXT: s_add_i32 s2, s2, s18 |
| ; GFX11-NEXT: s_add_i32 s7, s7, s23 |
| ; GFX11-NEXT: s_add_i32 s4, s4, s20 |
| ; GFX11-NEXT: s_add_i32 s5, s5, s21 |
| ; GFX11-NEXT: s_add_i32 s6, s6, s22 |
| ; GFX11-NEXT: s_add_i32 s11, s11, s27 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, s3 |
| ; GFX11-NEXT: s_add_i32 s8, s8, s24 |
| ; GFX11-NEXT: s_add_i32 s9, s9, s25 |
| ; GFX11-NEXT: s_add_i32 s10, s10, s26 |
| ; GFX11-NEXT: s_add_i32 s15, s15, s31 |
| ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 |
| ; GFX11-NEXT: s_add_i32 s12, s12, s28 |
| ; GFX11-NEXT: s_add_i32 s13, s13, s29 |
| ; GFX11-NEXT: s_add_i32 s14, s14, s30 |
| ; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 |
| ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 |
| ; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 |
| ; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 |
| ; GFX11-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 |
| ; GFX11-NEXT: v_mov_b32_e32 v14, s12 |
| ; GFX11-NEXT: s_clause 0x3 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P1_v16i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v30, 0 |
| ; GFX12-NEXT: s_clause 0x4 |
| ; GFX12-NEXT: global_load_b128 v[2:5], v30, s[0:1] |
| ; GFX12-NEXT: global_load_b128 v[6:9], v30, s[0:1] offset:16 |
| ; GFX12-NEXT: global_load_b128 v[10:13], v30, s[0:1] offset:32 |
| ; GFX12-NEXT: global_load_b128 v[14:17], v30, s[0:1] offset:48 |
| ; GFX12-NEXT: global_load_b128 v[18:21], v30, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b128 v[22:25], v30, s[0:1] offset:16 scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b128 v[26:29], v30, s[0:1] offset:32 scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s3, v5 |
| ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX12-NEXT: v_readfirstlane_b32 s19, v21 |
| ; GFX12-NEXT: v_readfirstlane_b32 s7, v9 |
| ; GFX12-NEXT: v_readfirstlane_b32 s16, v18 |
| ; GFX12-NEXT: v_readfirstlane_b32 s17, v19 |
| ; GFX12-NEXT: v_readfirstlane_b32 s18, v20 |
| ; GFX12-NEXT: v_readfirstlane_b32 s23, v25 |
| ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 |
| ; GFX12-NEXT: v_readfirstlane_b32 s5, v7 |
| ; GFX12-NEXT: v_readfirstlane_b32 s6, v8 |
| ; GFX12-NEXT: v_readfirstlane_b32 s11, v13 |
| ; GFX12-NEXT: v_readfirstlane_b32 s20, v22 |
| ; GFX12-NEXT: v_readfirstlane_b32 s21, v23 |
| ; GFX12-NEXT: v_readfirstlane_b32 s22, v24 |
| ; GFX12-NEXT: v_readfirstlane_b32 s27, v29 |
| ; GFX12-NEXT: v_readfirstlane_b32 s8, v10 |
| ; GFX12-NEXT: v_readfirstlane_b32 s9, v11 |
| ; GFX12-NEXT: v_readfirstlane_b32 s10, v12 |
| ; GFX12-NEXT: v_readfirstlane_b32 s15, v17 |
| ; GFX12-NEXT: v_readfirstlane_b32 s24, v26 |
| ; GFX12-NEXT: v_readfirstlane_b32 s25, v27 |
| ; GFX12-NEXT: v_readfirstlane_b32 s26, v28 |
| ; GFX12-NEXT: v_readfirstlane_b32 s31, v33 |
| ; GFX12-NEXT: v_readfirstlane_b32 s12, v14 |
| ; GFX12-NEXT: v_readfirstlane_b32 s13, v15 |
| ; GFX12-NEXT: v_readfirstlane_b32 s14, v16 |
| ; GFX12-NEXT: v_readfirstlane_b32 s28, v30 |
| ; GFX12-NEXT: v_readfirstlane_b32 s29, v31 |
| ; GFX12-NEXT: v_readfirstlane_b32 s30, v32 |
| ; GFX12-NEXT: s_add_co_i32 s3, s3, s19 |
| ; GFX12-NEXT: s_add_co_i32 s0, s0, s16 |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s17 |
| ; GFX12-NEXT: s_add_co_i32 s2, s2, s18 |
| ; GFX12-NEXT: s_add_co_i32 s7, s7, s23 |
| ; GFX12-NEXT: s_add_co_i32 s4, s4, s20 |
| ; GFX12-NEXT: s_add_co_i32 s5, s5, s21 |
| ; GFX12-NEXT: s_add_co_i32 s6, s6, s22 |
| ; GFX12-NEXT: s_add_co_i32 s11, s11, s27 |
| ; GFX12-NEXT: v_mov_b32_e32 v5, s3 |
| ; GFX12-NEXT: s_add_co_i32 s8, s8, s24 |
| ; GFX12-NEXT: s_add_co_i32 s9, s9, s25 |
| ; GFX12-NEXT: s_add_co_i32 s10, s10, s26 |
| ; GFX12-NEXT: s_add_co_i32 s15, s15, s31 |
| ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 |
| ; GFX12-NEXT: s_add_co_i32 s12, s12, s28 |
| ; GFX12-NEXT: s_add_co_i32 s13, s13, s29 |
| ; GFX12-NEXT: s_add_co_i32 s14, s14, s30 |
| ; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 |
| ; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 |
| ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 |
| ; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 |
| ; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 |
| ; GFX12-NEXT: v_mov_b32_e32 v14, s12 |
| ; GFX12-NEXT: s_clause 0x3 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 |
| ; GFX12-NEXT: s_endpgm |
| %a = load <16 x i32>, ptr addrspace(1) %ptra, align 2 |
| %b = load volatile <16 x i32>, ptr addrspace(1) %ptra |
| %sum = add <16 x i32> %a, %b |
| store <16 x i32> %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| |
| |
| ; Copies a single i8 from the addrspace(3) pointer %ptra to the addrspace(3) |
| ; pointer %out. %ptra is passed inreg; the checks expect it to be moved into a |
| ; VGPR and the value to be fetched with a byte DS load (ds_read_u8 / ds_load_u8) |
| ; and written back with a byte DS store. The gfx7 checks also set m0 to -1 first. |
| define amdgpu_ps void @load_divergent_P3_i8_any_extending(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { |
| ; GFX7-LABEL: load_divergent_P3_i8_any_extending: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX7-NEXT: s_mov_b32 m0, -1 |
| ; GFX7-NEXT: ds_read_u8 v1, v1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: ds_write_b8 v0, v1 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_divergent_P3_i8_any_extending: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX11-NEXT: ds_load_u8 v1, v1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: ds_store_b8 v0, v1 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_divergent_P3_i8_any_extending: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX12-NEXT: ds_load_u8 v1, v1 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: ds_store_b8 v0, v1 |
| ; GFX12-NEXT: s_endpgm |
| %a = load i8, ptr addrspace(3) %ptra |
| store i8 %a, ptr addrspace(3) %out |
| ret void |
| } |
| |
| ; with true16, S16 16-bit load |
| ; without true16, S32 16-bit any-extending load |
| ; Copies a single i16 from the addrspace(3) pointer %ptra (passed inreg) to the |
| ; addrspace(3) pointer %out. |
| ; In the true16 runs the checks expect ds_load_u16_d16 followed by a |
| ; v_readfirstlane_b32 / v_mov_b16 round trip before the store; in the other runs |
| ; a plain 16-bit DS load (ds_read_u16 / ds_load_u16) feeds the store directly. |
| define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { |
| ; GFX7-LABEL: load_divergent_P3_i16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX7-NEXT: s_mov_b32 m0, -1 |
| ; GFX7-NEXT: ds_read_u16 v1, v1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: ds_write_b16 v0, v1 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-True16-LABEL: load_divergent_P3_i16: |
| ; GFX11-True16: ; %bb.0: |
| ; GFX11-True16-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX11-True16-NEXT: ds_load_u16_d16 v1, v1 |
| ; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX11-True16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-True16-NEXT: v_mov_b16_e32 v1.l, s0 |
| ; GFX11-True16-NEXT: ds_store_b16 v0, v1 |
| ; GFX11-True16-NEXT: s_endpgm |
| ; |
| ; GFX11-NoTrue16-LABEL: load_divergent_P3_i16: |
| ; GFX11-NoTrue16: ; %bb.0: |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX11-NoTrue16-NEXT: ds_load_u16 v1, v1 |
| ; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NoTrue16-NEXT: ds_store_b16 v0, v1 |
| ; GFX11-NoTrue16-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_divergent_P3_i16: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX12-True16-NEXT: ds_load_u16_d16 v1, v1 |
| ; GFX12-True16-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX12-True16-NEXT: s_wait_alu 0xf1ff |
| ; GFX12-True16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v1.l, s0 |
| ; GFX12-True16-NEXT: ds_store_b16 v0, v1 |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_divergent_P3_i16: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX12-NoTrue16-NEXT: ds_load_u16 v1, v1 |
| ; GFX12-NoTrue16-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NoTrue16-NEXT: ds_store_b16 v0, v1 |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| %a = load i16, ptr addrspace(3) %ptra |
| store i16 %a, ptr addrspace(3) %out |
| ret void |
| } |
| |
| ; Copies a single i32 from the addrspace(3) pointer %ptra (passed inreg) to the |
| ; addrspace(3) pointer %out. The checks expect the pointer to be moved into a |
| ; VGPR, one 32-bit DS load (ds_read_b32 / ds_load_b32) and one 32-bit DS store. |
| define amdgpu_ps void @load_divergent_P3_i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { |
| ; GFX7-LABEL: load_divergent_P3_i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX7-NEXT: s_mov_b32 m0, -1 |
| ; GFX7-NEXT: ds_read_b32 v1, v1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: ds_write_b32 v0, v1 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_divergent_P3_i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX11-NEXT: ds_load_b32 v1, v1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: ds_store_b32 v0, v1 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_divergent_P3_i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX12-NEXT: ds_load_b32 v1, v1 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: ds_store_b32 v0, v1 |
| ; GFX12-NEXT: s_endpgm |
| %a = load i32, ptr addrspace(3) %ptra |
| store i32 %a, ptr addrspace(3) %out |
| ret void |
| } |
| |
| ; Copies a <2 x i32> from the addrspace(3) pointer %ptra (passed inreg) to the |
| ; addrspace(3) pointer %out. The checks expect the pointer to be moved into a |
| ; VGPR, one 64-bit DS load (ds_read_b64 / ds_load_b64) and one 64-bit DS store. |
| define amdgpu_ps void @load_divergent_P3_v2i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { |
| ; GFX7-LABEL: load_divergent_P3_v2i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX7-NEXT: s_mov_b32 m0, -1 |
| ; GFX7-NEXT: ds_read_b64 v[1:2], v1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: ds_write_b64 v0, v[1:2] |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_divergent_P3_v2i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX11-NEXT: ds_load_b64 v[1:2], v1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: ds_store_b64 v0, v[1:2] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_divergent_P3_v2i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX12-NEXT: ds_load_b64 v[1:2], v1 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: ds_store_b64 v0, v[1:2] |
| ; GFX12-NEXT: s_endpgm |
| %a = load <2 x i32>, ptr addrspace(3) %ptra |
| store <2 x i32> %a, ptr addrspace(3) %out |
| ret void |
| } |
| |
| ; Copies a <3 x i32> from the addrspace(3) pointer %ptra (passed inreg) to the |
| ; addrspace(3) pointer %out. The checks expect the pointer to be moved into a |
| ; VGPR, one 96-bit DS load (ds_read_b96 / ds_load_b96) and one 96-bit DS store. |
| define amdgpu_ps void @load_divergent_P3_v3i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { |
| ; GFX7-LABEL: load_divergent_P3_v3i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX7-NEXT: s_mov_b32 m0, -1 |
| ; GFX7-NEXT: ds_read_b96 v[1:3], v1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: ds_write_b96 v0, v[1:3] |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_divergent_P3_v3i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX11-NEXT: ds_load_b96 v[1:3], v1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: ds_store_b96 v0, v[1:3] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_divergent_P3_v3i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX12-NEXT: ds_load_b96 v[1:3], v1 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: ds_store_b96 v0, v[1:3] |
| ; GFX12-NEXT: s_endpgm |
| %a = load <3 x i32>, ptr addrspace(3) %ptra |
| store <3 x i32> %a, ptr addrspace(3) %out |
| ret void |
| } |
| |
| ; Copies a <4 x i32> from the addrspace(3) pointer %ptra (passed inreg) to the |
| ; addrspace(3) pointer %out. The checks expect the pointer to be moved into a |
| ; VGPR, one 128-bit DS load (ds_read_b128 / ds_load_b128) and one 128-bit DS store. |
| define amdgpu_ps void @load_divergent_P3_v4i32(ptr addrspace(3) inreg %ptra, ptr addrspace(3) %out) { |
| ; GFX7-LABEL: load_divergent_P3_v4i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX7-NEXT: s_mov_b32 m0, -1 |
| ; GFX7-NEXT: ds_read_b128 v[1:4], v1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: ds_write_b128 v0, v[1:4] |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_divergent_P3_v4i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX11-NEXT: ds_load_b128 v[1:4], v1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: ds_store_b128 v0, v[1:4] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_divergent_P3_v4i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX12-NEXT: ds_load_b128 v[1:4], v1 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: ds_store_b128 v0, v[1:4] |
| ; GFX12-NEXT: s_endpgm |
| %a = load <4 x i32>, ptr addrspace(3) %ptra |
| store <4 x i32> %a, ptr addrspace(3) %out |
| ret void |
| } |
| |
| |
| |
| ; constant address space, addrspace(4) |
| ; The "not uniform mmo" check for G_LOAD covers the case where the MMO somehow |
| ; ends up with an address space other than 4; there are no LLVM IR tests for it. |
| ; %b in tests will end up as uniform load in sgpr |
| |
| ; gfx12 true 16, not natural alignment |
| ; %a is an i16 load from %ptra with align 1; %b is a volatile i16 load from |
| ; %ptrb with no explicit alignment. Their sum is stored to %out. |
| ; Per the checks, %a is always a VMEM load whose result is read back with |
| ; v_readfirstlane_b32, while %b becomes s_load_u16 in the gfx12 runs only and |
| ; stays a VMEM load in the gfx7 and gfx11 runs. |
| define amdgpu_ps void @load_uniform_P4_i16_b16_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P4_i16_b16_gfx12: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s4, s2 |
| ; GFX7-NEXT: s_mov_b32 s5, s3 |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] |
| ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(1) |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-True16-LABEL: load_uniform_P4_i16_b16_gfx12: |
| ; GFX11-True16: ; %bb.0: |
| ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-True16-NEXT: s_clause 0x1 |
| ; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] |
| ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[2:3] glc dlc |
| ; GFX11-True16-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-True16-NEXT: s_endpgm |
| ; |
| ; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx12: |
| ; GFX11-NoTrue16: ; %bb.0: |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NoTrue16-NEXT: s_clause 0x1 |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[2:3] glc dlc |
| ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NoTrue16-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P4_i16_b16_gfx12: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] |
| ; GFX12-True16-NEXT: s_load_u16 s0, s[2:3], 0x0 |
| ; GFX12-True16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-True16-NEXT: s_add_co_i32 s0, s1, s0 |
| ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx12: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[2:3], 0x0 |
| ; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s1, s0 |
| ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| %a = load i16, ptr addrspace(4) %ptra, align 1 |
| %b = load volatile i16, ptr addrspace(4) %ptrb |
| %sum = add i16 %a, %b |
| store i16 %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx11 true16, 16-bit load, not align 4 |
| ; Both loads read an i16 from %ptra: %a has no explicit alignment, %b is |
| ; volatile with align 4. Their sum is stored to %out; %ptrb is not used. |
| ; Per the checks, the gfx7 and gfx11 runs keep %a as a VMEM load and select a |
| ; 32-bit scalar load (s_load_dword / s_load_b32) for %b, while the gfx12 runs |
| ; select s_load_u16 for both loads. |
| define amdgpu_ps void @load_uniform_P4_i16_b16_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P4_i16_b16_gfx11: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 |
| ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_add_i32 s0, s1, s0 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-True16-LABEL: load_uniform_P4_i16_b16_gfx11: |
| ; GFX11-True16: ; %bb.0: |
| ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] |
| ; GFX11-True16-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-True16-NEXT: s_add_i32 s0, s1, s0 |
| ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-True16-NEXT: s_endpgm |
| ; |
| ; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx11: |
| ; GFX11-NoTrue16: ; %bb.0: |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] |
| ; GFX11-NoTrue16-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s1, s0 |
| ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NoTrue16-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P4_i16_b16_gfx11: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: s_clause 0x1 |
| ; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0 |
| ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_b16_gfx11: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: s_clause 0x1 |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0 |
| ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| %a = load i16, ptr addrspace(4) %ptra |
| %b = load volatile i16, ptr addrspace(4) %ptra, align 4 |
| %sum = add i16 %a, %b |
| store i16 %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx12 without true16, 16-bit any-extending load, not natural alignment |
| ; Both loads read an i16 from %ptra: %a with align 1, %b volatile with no |
| ; explicit alignment. Their sum is stored to %out; %ptrb is not used. |
| ; Per the checks, the gfx7 and gfx11 runs use VMEM loads for both values, while |
| ; the gfx12 runs keep %a as a VMEM load and select s_load_u16 for %b. |
| define amdgpu_ps void @load_uniform_P4_i16_anyextending_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P4_i16_anyextending_gfx12: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(1) |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-True16-LABEL: load_uniform_P4_i16_anyextending_gfx12: |
| ; GFX11-True16: ; %bb.0: |
| ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-True16-NEXT: s_clause 0x1 |
| ; GFX11-True16-NEXT: global_load_d16_b16 v3, v2, s[0:1] |
| ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] glc dlc |
| ; GFX11-True16-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-True16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-True16-NEXT: s_endpgm |
| ; |
| ; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx12: |
| ; GFX11-NoTrue16: ; %bb.0: |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NoTrue16-NEXT: s_clause 0x1 |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v3, v2, s[0:1] |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] glc dlc |
| ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NoTrue16-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P4_i16_anyextending_gfx12: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] |
| ; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-True16-NEXT: s_add_co_i32 s0, s1, s0 |
| ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx12: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s1, s0 |
| ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| %a = load i16, ptr addrspace(4) %ptra, align 1 |
| %b = load volatile i16, ptr addrspace(4) %ptra |
| %sum = add i16 %a, %b |
| store i16 %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; gfx11 (or older) without true16: s16 any-extending load, not align 4. |
| ; %a is a non-volatile i16 load with no explicit alignment; %b is a volatile |
| ; i16 load of the same pointer marked align 4. %ptrb is not referenced. |
| ; The hawaii and gfx1100 checks expect one 16-bit vector-memory load that is |
| ; read back with v_readfirstlane plus one 32-bit s_load; the gfx1200 checks |
| ; expect two s_load_u16 issued in a single clause. |
| define amdgpu_ps void @load_uniform_P4_i16_anyextending_gfx11(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P4_i16_anyextending_gfx11: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 |
| ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_add_i32 s0, s1, s0 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-True16-LABEL: load_uniform_P4_i16_anyextending_gfx11: |
| ; GFX11-True16: ; %bb.0: |
| ; GFX11-True16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-True16-NEXT: global_load_d16_b16 v2, v2, s[0:1] |
| ; GFX11-True16-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-True16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-True16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-True16-NEXT: s_add_i32 s0, s1, s0 |
| ; GFX11-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX11-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-True16-NEXT: s_endpgm |
| ; |
| ; GFX11-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx11: |
| ; GFX11-NoTrue16: ; %bb.0: |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NoTrue16-NEXT: global_load_u16 v2, v2, s[0:1] |
| ; GFX11-NoTrue16-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NoTrue16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NoTrue16-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NoTrue16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NoTrue16-NEXT: s_add_i32 s0, s1, s0 |
| ; GFX11-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX11-NoTrue16-NEXT: s_endpgm |
| ; |
| ; GFX12-True16-LABEL: load_uniform_P4_i16_anyextending_gfx11: |
| ; GFX12-True16: ; %bb.0: |
| ; GFX12-True16-NEXT: s_clause 0x1 |
| ; GFX12-True16-NEXT: s_load_u16 s2, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-True16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-True16-NEXT: s_add_co_i32 s0, s2, s0 |
| ; GFX12-True16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-True16-NEXT: v_mov_b16_e32 v2.l, s0 |
| ; GFX12-True16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-True16-NEXT: s_endpgm |
| ; |
| ; GFX12-NoTrue16-LABEL: load_uniform_P4_i16_anyextending_gfx11: |
| ; GFX12-NoTrue16: ; %bb.0: |
| ; GFX12-NoTrue16-NEXT: s_clause 0x1 |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s2, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: s_load_u16 s0, s[0:1], 0x0 |
| ; GFX12-NoTrue16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NoTrue16-NEXT: s_add_co_i32 s0, s2, s0 |
| ; GFX12-NoTrue16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NoTrue16-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NoTrue16-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NoTrue16-NEXT: s_endpgm |
| %a = load i16, ptr addrspace(4) %ptra |
| %b = load volatile i16, ptr addrspace(4) %ptra, align 4 |
| %sum = add i16 %a, %b |
| store i16 %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; any target, 32-bit load, not align 4. |
| ; %a is a non-volatile i32 load with align 2; %b is a volatile i32 load of the |
| ; same pointer with no explicit alignment. %ptrb is not referenced. |
| ; Every run line expects one 32-bit vector-memory load that is read back with |
| ; v_readfirstlane and one 32-bit s_load, combined by a scalar add. |
| define amdgpu_ps void @load_uniform_P4_i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P4_i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], 0 |
| ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_add_i32 s0, s1, s0 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P4_i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: global_load_b32 v2, v2, s[0:1] |
| ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_add_i32 s0, s1, s0 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NEXT: global_store_b32 v[0:1], v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NEXT: global_load_b32 v2, v2, s[0:1] |
| ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_add_co_i32 s0, s1, s0 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b32 v[0:1], v2, off |
| ; GFX12-NEXT: s_endpgm |
| %a = load i32, ptr addrspace(4) %ptra, align 2 |
| %b = load volatile i32, ptr addrspace(4) %ptra |
| %sum = add i32 %a, %b |
| store i32 %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; any target, 64-bit load, not align 4. |
| ; %a is a non-volatile <2 x i32> load with align 2; %b is a volatile load of |
| ; the same pointer with no explicit alignment. %ptrb is not referenced. |
| ; Every run line expects one 64-bit vector-memory load whose two dwords are |
| ; read back with v_readfirstlane, one 64-bit s_load, and two scalar adds. |
| define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P4_v2i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 |
| ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s5, v3 |
| ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_add_i32 s1, s5, s1 |
| ; GFX7-NEXT: s_add_i32 s0, s4, s0 |
| ; GFX7-NEXT: v_mov_b32_e32 v3, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P4_v2i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_add_i32 s0, s2, s0 |
| ; GFX11-NEXT: s_add_i32 s1, s3, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v2i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1] |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s2, v2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s3, v3 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_add_co_i32 s0, s2, s0 |
| ; GFX12-NEXT: s_add_co_i32 s1, s3, s1 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off |
| ; GFX12-NEXT: s_endpgm |
| %a = load <2 x i32>, ptr addrspace(4) %ptra, align 2 |
| %b = load volatile <2 x i32>, ptr addrspace(4) %ptra |
| %sum = add <2 x i32> %a, %b |
| store <2 x i32> %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; any target, 96-bit load, not align 4. |
| ; %a is a non-volatile <3 x i32> load with align 2; %b is a volatile load of |
| ; the same pointer with no explicit alignment. %ptrb is not referenced. |
| ; Every run line expects one 96-bit vector-memory load read back with three |
| ; v_readfirstlane. The scalar load is 128 bits wide in the hawaii and gfx1100 |
| ; checks and exactly 96 bits wide (s_load_b96) in the gfx1200 checks. |
| define amdgpu_ps void @load_uniform_P4_v3i32_gfx12(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P4_v3i32_gfx12: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dwordx3 v[2:4], off, s[0:3], 0 |
| ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s7, v4 |
| ; GFX7-NEXT: s_add_i32 s4, s0, s4 |
| ; GFX7-NEXT: s_add_i32 s5, s1, s5 |
| ; GFX7-NEXT: s_add_i32 s6, s7, s6 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX7-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX7-NEXT: v_mov_b32_e32 v4, s6 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_dwordx3 v[2:4], v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P4_v3i32_gfx12: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: global_load_b96 v[2:4], v2, s[0:1] |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s5, v4 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s3, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s4, v3 |
| ; GFX11-NEXT: s_add_i32 s2, s5, s2 |
| ; GFX11-NEXT: s_add_i32 s0, s3, s0 |
| ; GFX11-NEXT: s_add_i32 s1, s4, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v3i32_gfx12: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NEXT: global_load_b96 v[2:4], v2, s[0:1] |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s5, v4 |
| ; GFX12-NEXT: v_readfirstlane_b32 s3, v2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s4, v3 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_add_co_i32 s2, s5, s2 |
| ; GFX12-NEXT: s_add_co_i32 s0, s3, s0 |
| ; GFX12-NEXT: s_add_co_i32 s1, s4, s1 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off |
| ; GFX12-NEXT: s_endpgm |
| %a = load <3 x i32>, ptr addrspace(4) %ptra, align 2 |
| %b = load volatile <3 x i32>, ptr addrspace(4) %ptra |
| %sum = add <3 x i32> %a, %b |
| store <3 x i32> %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; any target, 128-bit load, not align 4. |
| ; %a is a non-volatile <4 x i32> load with align 2; %b is a volatile load of |
| ; the same pointer with no explicit alignment. %ptrb is not referenced. |
| ; Every run line expects one 128-bit vector-memory load read back with four |
| ; v_readfirstlane, one 128-bit s_load, and four scalar adds. |
| define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P4_v4i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 |
| ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 |
| ; GFX7-NEXT: v_readfirstlane_b32 s8, v4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s9, v5 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_add_i32 s4, s0, s4 |
| ; GFX7-NEXT: s_add_i32 s5, s1, s5 |
| ; GFX7-NEXT: s_add_i32 s6, s8, s6 |
| ; GFX7-NEXT: s_add_i32 s7, s9, s7 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX7-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX7-NEXT: v_mov_b32_e32 v4, s6 |
| ; GFX7-NEXT: v_mov_b32_e32 v5, s7 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P4_v4i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: global_load_b128 v[2:5], v2, s[0:1] |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s7, v5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s4, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s5, v3 |
| ; GFX11-NEXT: v_readfirstlane_b32 s6, v4 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_add_i32 s3, s7, s3 |
| ; GFX11-NEXT: s_add_i32 s0, s4, s0 |
| ; GFX11-NEXT: s_add_i32 s1, s5, s1 |
| ; GFX11-NEXT: s_add_i32 s2, s6, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v4i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NEXT: global_load_b128 v[2:5], v2, s[0:1] |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s7, v5 |
| ; GFX12-NEXT: v_readfirstlane_b32 s4, v2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s5, v3 |
| ; GFX12-NEXT: v_readfirstlane_b32 s6, v4 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_add_co_i32 s3, s7, s3 |
| ; GFX12-NEXT: s_add_co_i32 s0, s4, s0 |
| ; GFX12-NEXT: s_add_co_i32 s1, s5, s1 |
| ; GFX12-NEXT: s_add_co_i32 s2, s6, s2 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: s_endpgm |
| %a = load <4 x i32>, ptr addrspace(4) %ptra, align 2 |
| %b = load volatile <4 x i32>, ptr addrspace(4) %ptra |
| %sum = add <4 x i32> %a, %b |
| store <4 x i32> %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; any target, 256-bit load, not align 4. |
| ; %a is a non-volatile <8 x i32> load with align 2; %b is a volatile load of |
| ; the same pointer with no explicit alignment. %ptrb is not referenced. |
| ; Every run line expects the vector-memory side to be split into two 128-bit |
| ; loads (offsets 0 and 16) read back with eight v_readfirstlane, while the |
| ; scalar side stays a single 256-bit s_load. The result is written with two |
| ; 128-bit stores. |
| define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P4_v8i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 |
| ; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(1) |
| ; GFX7-NEXT: v_readfirstlane_b32 s12, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s13, v3 |
| ; GFX7-NEXT: v_readfirstlane_b32 s14, v4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s15, v5 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s16, v6 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_add_i32 s4, s12, s4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s17, v7 |
| ; GFX7-NEXT: v_readfirstlane_b32 s18, v8 |
| ; GFX7-NEXT: v_readfirstlane_b32 s19, v9 |
| ; GFX7-NEXT: s_add_i32 s5, s13, s5 |
| ; GFX7-NEXT: s_add_i32 s6, s14, s6 |
| ; GFX7-NEXT: s_add_i32 s7, s15, s7 |
| ; GFX7-NEXT: s_add_i32 s8, s16, s8 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX7-NEXT: s_add_i32 s9, s17, s9 |
| ; GFX7-NEXT: s_add_i32 s10, s18, s10 |
| ; GFX7-NEXT: s_add_i32 s11, s19, s11 |
| ; GFX7-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX7-NEXT: v_mov_b32_e32 v4, s6 |
| ; GFX7-NEXT: v_mov_b32_e32 v5, s7 |
| ; GFX7-NEXT: v_mov_b32_e32 v6, s8 |
| ; GFX7-NEXT: v_mov_b32_e32 v7, s9 |
| ; GFX7-NEXT: v_mov_b32_e32 v8, s10 |
| ; GFX7-NEXT: v_mov_b32_e32 v9, s11 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P4_v8i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v6, 0 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1] |
| ; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16 |
| ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-NEXT: v_readfirstlane_b32 s11, v5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s8, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s9, v3 |
| ; GFX11-NEXT: v_readfirstlane_b32 s10, v4 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s15, v9 |
| ; GFX11-NEXT: v_readfirstlane_b32 s12, v6 |
| ; GFX11-NEXT: v_readfirstlane_b32 s13, v7 |
| ; GFX11-NEXT: v_readfirstlane_b32 s14, v8 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_add_i32 s3, s11, s3 |
| ; GFX11-NEXT: s_add_i32 s0, s8, s0 |
| ; GFX11-NEXT: s_add_i32 s1, s9, s1 |
| ; GFX11-NEXT: s_add_i32 s2, s10, s2 |
| ; GFX11-NEXT: s_add_i32 s7, s15, s7 |
| ; GFX11-NEXT: s_add_i32 s4, s12, s4 |
| ; GFX11-NEXT: s_add_i32 s5, s13, s5 |
| ; GFX11-NEXT: s_add_i32 s6, s14, s6 |
| ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v8i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v6, 0 |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1] |
| ; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16 |
| ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x1 |
| ; GFX12-NEXT: v_readfirstlane_b32 s11, v5 |
| ; GFX12-NEXT: v_readfirstlane_b32 s8, v2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s9, v3 |
| ; GFX12-NEXT: v_readfirstlane_b32 s10, v4 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s15, v9 |
| ; GFX12-NEXT: v_readfirstlane_b32 s12, v6 |
| ; GFX12-NEXT: v_readfirstlane_b32 s13, v7 |
| ; GFX12-NEXT: v_readfirstlane_b32 s14, v8 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_add_co_i32 s3, s11, s3 |
| ; GFX12-NEXT: s_add_co_i32 s0, s8, s0 |
| ; GFX12-NEXT: s_add_co_i32 s1, s9, s1 |
| ; GFX12-NEXT: s_add_co_i32 s2, s10, s2 |
| ; GFX12-NEXT: s_add_co_i32 s7, s15, s7 |
| ; GFX12-NEXT: s_add_co_i32 s4, s12, s4 |
| ; GFX12-NEXT: s_add_co_i32 s5, s13, s5 |
| ; GFX12-NEXT: s_add_co_i32 s6, s14, s6 |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| ; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 |
| ; GFX12-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX12-NEXT: s_endpgm |
| %a = load <8 x i32>, ptr addrspace(4) %ptra, align 2 |
| %b = load volatile <8 x i32>, ptr addrspace(4) %ptra |
| %sum = add <8 x i32> %a, %b |
| store <8 x i32> %sum, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; any target, 512-bit load, not align 4. |
| ; %a is a non-volatile <16 x i32> load with align 2; %b is a volatile load of |
| ; the same pointer with no explicit alignment. %ptrb is not referenced. |
| ; Every run line expects the vector-memory side to be split into four 128-bit |
| ; loads (offsets 0, 16, 32 and 48) read back with sixteen v_readfirstlane, |
| ; while the scalar side stays a single 512-bit s_load. The result is written |
| ; with four 128-bit stores. |
| define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr addrspace(4) inreg %ptrb, ptr addrspace(1) %out) { |
| ; GFX7-LABEL: load_uniform_P4_v16i32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_mov_b32 s2, -1 |
| ; GFX7-NEXT: s_mov_b32 s3, 0xf000 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 |
| ; GFX7-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 |
| ; GFX7-NEXT: s_mov_b32 s2, 0 |
| ; GFX7-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(3) |
| ; GFX7-NEXT: v_readfirstlane_b32 s20, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s21, v3 |
| ; GFX7-NEXT: v_readfirstlane_b32 s22, v4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s23, v5 |
| ; GFX7-NEXT: s_waitcnt vmcnt(2) |
| ; GFX7-NEXT: v_readfirstlane_b32 s24, v6 |
| ; GFX7-NEXT: s_waitcnt vmcnt(1) |
| ; GFX7-NEXT: v_readfirstlane_b32 s28, v10 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readfirstlane_b32 s33, v14 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_add_i32 s4, s20, s4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s25, v7 |
| ; GFX7-NEXT: v_readfirstlane_b32 s26, v8 |
| ; GFX7-NEXT: v_readfirstlane_b32 s27, v9 |
| ; GFX7-NEXT: v_readfirstlane_b32 s29, v11 |
| ; GFX7-NEXT: v_readfirstlane_b32 s30, v12 |
| ; GFX7-NEXT: v_readfirstlane_b32 s31, v13 |
| ; GFX7-NEXT: v_readfirstlane_b32 s34, v15 |
| ; GFX7-NEXT: v_readfirstlane_b32 s35, v16 |
| ; GFX7-NEXT: v_readfirstlane_b32 s36, v17 |
| ; GFX7-NEXT: s_add_i32 s5, s21, s5 |
| ; GFX7-NEXT: s_add_i32 s6, s22, s6 |
| ; GFX7-NEXT: s_add_i32 s7, s23, s7 |
| ; GFX7-NEXT: s_add_i32 s8, s24, s8 |
| ; GFX7-NEXT: s_add_i32 s12, s28, s12 |
| ; GFX7-NEXT: s_add_i32 s16, s33, s16 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX7-NEXT: s_add_i32 s9, s25, s9 |
| ; GFX7-NEXT: s_add_i32 s10, s26, s10 |
| ; GFX7-NEXT: s_add_i32 s11, s27, s11 |
| ; GFX7-NEXT: s_add_i32 s13, s29, s13 |
| ; GFX7-NEXT: s_add_i32 s14, s30, s14 |
| ; GFX7-NEXT: s_add_i32 s15, s31, s15 |
| ; GFX7-NEXT: s_add_i32 s17, s34, s17 |
| ; GFX7-NEXT: s_add_i32 s18, s35, s18 |
| ; GFX7-NEXT: s_add_i32 s19, s36, s19 |
| ; GFX7-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX7-NEXT: v_mov_b32_e32 v4, s6 |
| ; GFX7-NEXT: v_mov_b32_e32 v5, s7 |
| ; GFX7-NEXT: v_mov_b32_e32 v6, s8 |
| ; GFX7-NEXT: v_mov_b32_e32 v10, s12 |
| ; GFX7-NEXT: v_mov_b32_e32 v14, s16 |
| ; GFX7-NEXT: v_mov_b32_e32 v7, s9 |
| ; GFX7-NEXT: v_mov_b32_e32 v8, s10 |
| ; GFX7-NEXT: v_mov_b32_e32 v9, s11 |
| ; GFX7-NEXT: v_mov_b32_e32 v11, s13 |
| ; GFX7-NEXT: v_mov_b32_e32 v12, s14 |
| ; GFX7-NEXT: v_mov_b32_e32 v13, s15 |
| ; GFX7-NEXT: v_mov_b32_e32 v15, s17 |
| ; GFX7-NEXT: v_mov_b32_e32 v16, s18 |
| ; GFX7-NEXT: v_mov_b32_e32 v17, s19 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[0:3], 0 addr64 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[0:3], 0 addr64 offset:16 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[0:3], 0 addr64 offset:48 |
| ; GFX7-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: load_uniform_P4_v16i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v14, 0 |
| ; GFX11-NEXT: s_clause 0x3 |
| ; GFX11-NEXT: global_load_b128 v[2:5], v14, s[0:1] |
| ; GFX11-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16 |
| ; GFX11-NEXT: global_load_b128 v[10:13], v14, s[0:1] offset:32 |
| ; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48 |
| ; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(3) |
| ; GFX11-NEXT: v_readfirstlane_b32 s19, v5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s16, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s17, v3 |
| ; GFX11-NEXT: v_readfirstlane_b32 s18, v4 |
| ; GFX11-NEXT: s_waitcnt vmcnt(2) |
| ; GFX11-NEXT: v_readfirstlane_b32 s23, v9 |
| ; GFX11-NEXT: v_readfirstlane_b32 s20, v6 |
| ; GFX11-NEXT: v_readfirstlane_b32 s21, v7 |
| ; GFX11-NEXT: v_readfirstlane_b32 s22, v8 |
| ; GFX11-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-NEXT: v_readfirstlane_b32 s27, v13 |
| ; GFX11-NEXT: v_readfirstlane_b32 s24, v10 |
| ; GFX11-NEXT: v_readfirstlane_b32 s25, v11 |
| ; GFX11-NEXT: v_readfirstlane_b32 s26, v12 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_readfirstlane_b32 s31, v17 |
| ; GFX11-NEXT: v_readfirstlane_b32 s28, v14 |
| ; GFX11-NEXT: v_readfirstlane_b32 s29, v15 |
| ; GFX11-NEXT: v_readfirstlane_b32 s30, v16 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_add_i32 s3, s19, s3 |
| ; GFX11-NEXT: s_add_i32 s0, s16, s0 |
| ; GFX11-NEXT: s_add_i32 s1, s17, s1 |
| ; GFX11-NEXT: s_add_i32 s2, s18, s2 |
| ; GFX11-NEXT: s_add_i32 s7, s23, s7 |
| ; GFX11-NEXT: s_add_i32 s4, s20, s4 |
| ; GFX11-NEXT: s_add_i32 s5, s21, s5 |
| ; GFX11-NEXT: s_add_i32 s6, s22, s6 |
| ; GFX11-NEXT: s_add_i32 s11, s27, s11 |
| ; GFX11-NEXT: v_mov_b32_e32 v5, s3 |
| ; GFX11-NEXT: s_add_i32 s8, s24, s8 |
| ; GFX11-NEXT: s_add_i32 s9, s25, s9 |
| ; GFX11-NEXT: s_add_i32 s10, s26, s10 |
| ; GFX11-NEXT: s_add_i32 s15, s31, s15 |
| ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 |
| ; GFX11-NEXT: s_add_i32 s12, s28, s12 |
| ; GFX11-NEXT: s_add_i32 s13, s29, s13 |
| ; GFX11-NEXT: s_add_i32 s14, s30, s14 |
| ; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 |
| ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 |
| ; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 |
| ; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 |
| ; GFX11-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 |
| ; GFX11-NEXT: v_mov_b32_e32 v14, s12 |
| ; GFX11-NEXT: s_clause 0x3 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: load_uniform_P4_v16i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: v_mov_b32_e32 v14, 0 |
| ; GFX12-NEXT: s_clause 0x3 |
| ; GFX12-NEXT: global_load_b128 v[2:5], v14, s[0:1] |
| ; GFX12-NEXT: global_load_b128 v[6:9], v14, s[0:1] offset:16 |
| ; GFX12-NEXT: global_load_b128 v[10:13], v14, s[0:1] offset:32 |
| ; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48 |
| ; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x3 |
| ; GFX12-NEXT: v_readfirstlane_b32 s19, v5 |
| ; GFX12-NEXT: v_readfirstlane_b32 s16, v2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s17, v3 |
| ; GFX12-NEXT: v_readfirstlane_b32 s18, v4 |
| ; GFX12-NEXT: s_wait_loadcnt 0x2 |
| ; GFX12-NEXT: v_readfirstlane_b32 s23, v9 |
| ; GFX12-NEXT: v_readfirstlane_b32 s20, v6 |
| ; GFX12-NEXT: v_readfirstlane_b32 s21, v7 |
| ; GFX12-NEXT: v_readfirstlane_b32 s22, v8 |
| ; GFX12-NEXT: s_wait_loadcnt 0x1 |
| ; GFX12-NEXT: v_readfirstlane_b32 s27, v13 |
| ; GFX12-NEXT: v_readfirstlane_b32 s24, v10 |
| ; GFX12-NEXT: v_readfirstlane_b32 s25, v11 |
| ; GFX12-NEXT: v_readfirstlane_b32 s26, v12 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s31, v17 |
| ; GFX12-NEXT: v_readfirstlane_b32 s28, v14 |
| ; GFX12-NEXT: v_readfirstlane_b32 s29, v15 |
| ; GFX12-NEXT: v_readfirstlane_b32 s30, v16 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_add_co_i32 s3, s19, s3 |
| ; GFX12-NEXT: s_add_co_i32 s0, s16, s0 |
| ; GFX12-NEXT: s_add_co_i32 s1, s17, s1 |
| ; GFX12-NEXT: s_add_co_i32 s2, s18, s2 |
| ; GFX12-NEXT: s_add_co_i32 s7, s23, s7 |
| ; GFX12-NEXT: s_add_co_i32 s4, s20, s4 |
| ; GFX12-NEXT: s_add_co_i32 s5, s21, s5 |
| ; GFX12-NEXT: s_add_co_i32 s6, s22, s6 |
| ; GFX12-NEXT: s_add_co_i32 s11, s27, s11 |
| ; GFX12-NEXT: v_mov_b32_e32 v5, s3 |
| ; GFX12-NEXT: s_add_co_i32 s8, s24, s8 |
| ; GFX12-NEXT: s_add_co_i32 s9, s25, s9 |
| ; GFX12-NEXT: s_add_co_i32 s10, s26, s10 |
| ; GFX12-NEXT: s_add_co_i32 s15, s31, s15 |
| ; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 |
| ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 |
| ; GFX12-NEXT: s_add_co_i32 s12, s28, s12 |
| ; GFX12-NEXT: s_add_co_i32 s13, s29, s13 |
| ; GFX12-NEXT: s_add_co_i32 s14, s30, s14 |
| ; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 |
| ; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 |
| ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 |
| ; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v17, s15 |
| ; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v15, s13 |
| ; GFX12-NEXT: v_mov_b32_e32 v14, s12 |
| ; GFX12-NEXT: s_clause 0x3 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 |
| ; GFX12-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 |
| ; GFX12-NEXT: s_endpgm |
| %a = load <16 x i32>, ptr addrspace(4) %ptra, align 2 |
| %b = load volatile <16 x i32>, ptr addrspace(4) %ptra |
| %sum = add <16 x i32> %a, %b |
| store <16 x i32> %sum, ptr addrspace(1) %out |
| ret void |
| } |