; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX12 %s
; Test that extractelement and shufflevector operations on v16i8 loads get
; optimized away by DAGCombiner, showing that these operations are "free"
; in terms of generated instructions.
@lds = external addrspace(3) global [0 x i8], align 16
; Multiple extract elements keep the full ds_read_b128.
define void @extract_multiple_v16i8(ptr addrspace(1) %out) {
; GFX9-LABEL: extract_multiple_v16i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b128 v[2:5], v2
; GFX9-NEXT: s_mov_b32 s0, 0xc0c0004
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_perm_b32 v2, v2, v3, s0
; GFX9-NEXT: v_perm_b32 v3, v4, v5, s0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: extract_multiple_v16i8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: ds_load_b128 v[2:5], v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_perm_b32 v4, v4, v5, 0xc0c0004
; GFX12-NEXT: v_perm_b32 v2, v2, v3, 0xc0c0004
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX12-NEXT: v_or_b32_e32 v2, v2, v3
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_set_pc_i64 s[30:31]
  ; Read one <16 x i8> from the start of @lds, then copy lanes 0, 4, 8 and 12
  ; into the four consecutive bytes at %out. The assertions above expect a
  ; single 128-bit DS load whose selected bytes are repacked into one dword
  ; store on both targets.
  %src = getelementptr inbounds i8, ptr addrspace(3) @lds, i32 0
  %vec = load <16 x i8>, ptr addrspace(3) %src, align 16

  ; One byte from each 32-bit quarter of the loaded vector.
  %lane0 = extractelement <16 x i8> %vec, i32 0
  %lane4 = extractelement <16 x i8> %vec, i32 4
  %lane8 = extractelement <16 x i8> %vec, i32 8
  %lane12 = extractelement <16 x i8> %vec, i32 12

  ; Byte stores to %out+0 .. %out+3, in ascending address order.
  %dst0 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 0
  store i8 %lane0, ptr addrspace(1) %dst0, align 1
  %dst1 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
  store i8 %lane4, ptr addrspace(1) %dst1, align 1
  %dst2 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 2
  store i8 %lane8, ptr addrspace(1) %dst2, align 1
  %dst3 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 3
  store i8 %lane12, ptr addrspace(1) %dst3, align 1
  ret void
}
; Multiple extract elements keep the full ds_read_b64.
define void @extract_multiple_v8i8(ptr addrspace(1) %out) {
; GFX9-LABEL: extract_multiple_v8i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b64 v[2:3], v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v3
; GFX9-NEXT: s_and_b32 s1, s0, 0xff
; GFX9-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: s_or_b32 s0, s1, s0
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: v_or_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: extract_multiple_v8i8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: ds_load_b64 v[2:3], v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_lshr_b32 s1, s0, 16
; GFX12-NEXT: s_and_b32 s0, s0, 0xff
; GFX12-NEXT: s_and_b32 s1, s1, 0xff00
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b32 s0, s0, s1
; GFX12-NEXT: s_lshl_b32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_or_b32_e32 v2, s0, v2
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_set_pc_i64 s[30:31]
  ; Load one <8 x i8> from the start of @lds and write lanes 0, 1, 4 and 7 to
  ; bytes 0..3 of %out. The assertions above expect a single 64-bit DS load
  ; and a single dword store on both targets.
  %ptr = getelementptr inbounds i8, ptr addrspace(3) @lds, i32 0
  %val = load <8 x i8>, ptr addrspace(3) %ptr, align 16

  ; Each extracted value is named after the lane index it reads.
  %e0 = extractelement <8 x i8> %val, i32 0
  %e1 = extractelement <8 x i8> %val, i32 1
  %e4 = extractelement <8 x i8> %val, i32 4
  %e7 = extractelement <8 x i8> %val, i32 7

  ; Lanes 0 and 1 come from the low dword of the load, lanes 4 and 7 from the
  ; high dword; they land in adjacent output bytes.
  %out0 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 0
  store i8 %e0, ptr addrspace(1) %out0, align 1
  %out1 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
  store i8 %e1, ptr addrspace(1) %out1, align 1
  %out2 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 2
  store i8 %e4, ptr addrspace(1) %out2, align 1
  %out3 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 3
  store i8 %e7, ptr addrspace(1) %out3, align 1
  ret void
}
; Multiple extract elements keep the 32-bit load.
define void @extract_multiple_v4i8(ptr addrspace(1) %out) {
; GFX9-LABEL: extract_multiple_v4i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b32 v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:2
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: extract_multiple_v4i8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: ds_load_b32 v2, v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_d16_hi_b8 v[0:1], v2, off offset:2
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NEXT: s_set_pc_i64 s[30:31]
  ; Read one <4 x i8> from the start of @lds and copy lanes 0, 1 and 2 to
  ; bytes 0..2 of %out; lane 3 is never used. The assertions above expect a
  ; single 32-bit DS load, a 16-bit store for the first two bytes, and a
  ; d16_hi byte store at offset 2 for the third.
  %src = getelementptr inbounds i8, ptr addrspace(3) @lds, i32 0
  %vec = load <4 x i8>, ptr addrspace(3) %src, align 4

  %lane0 = extractelement <4 x i8> %vec, i32 0
  %lane1 = extractelement <4 x i8> %vec, i32 1
  %lane2 = extractelement <4 x i8> %vec, i32 2

  ; Byte stores to %out+0 .. %out+2, in ascending address order.
  %dst0 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 0
  store i8 %lane0, ptr addrspace(1) %dst0, align 1
  %dst1 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
  store i8 %lane1, ptr addrspace(1) %dst1, align 1
  %dst2 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 2
  store i8 %lane2, ptr addrspace(1) %dst2, align 1
  ret void
}