| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX9 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX12 %s |
| |
| ; Test that extractelement and shufflevector operations on i8 vector loads |
| ; (v16i8, v8i8 and v4i8) get optimized away by DAGCombiner, showing that these |
| ; operations are "free" in terms of generated instructions. |
| |
| @lds = external addrspace(3) global [0 x i8], align 16 |
| |
| ; Multiple extract elements keep the full ds_read_b128. |
| define void @extract_multiple_v16i8(ptr addrspace(1) %out) { |
| ; GFX9-LABEL: extract_multiple_v16i8: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-NEXT: ds_read_b128 v[2:5], v2 |
| ; GFX9-NEXT: s_mov_b32 s0, 0xc0c0004 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_perm_b32 v2, v2, v3, s0 |
| ; GFX9-NEXT: v_perm_b32 v3, v4, v5, s0 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 |
| ; GFX9-NEXT: global_store_dword v[0:1], v2, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-LABEL: extract_multiple_v16i8: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NEXT: ds_load_b128 v[2:5], v2 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: v_perm_b32 v4, v4, v5, 0xc0c0004 |
| ; GFX12-NEXT: v_perm_b32 v2, v2, v3, 0xc0c0004 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 |
| ; GFX12-NEXT: v_or_b32_e32 v2, v2, v3 |
| ; GFX12-NEXT: global_store_b32 v[0:1], v2, off |
| ; GFX12-NEXT: s_set_pc_i64 s[30:31] |
| ; Test body: a single 16-byte vector load from @lds feeds four extractelements, |
| ; and each extracted byte is stored to one of four consecutive bytes of %out. |
| ; The autogenerated checks above expect one 128-bit DS load on both targets. |
| ; |
| ; The GEP uses a constant zero index, so %ptr is the address of @lds itself. |
| %ptr = getelementptr inbounds i8, ptr addrspace(3) @lds, i32 0 |
| %val = load <16 x i8>, ptr addrspace(3) %ptr, align 16 |
| ; Take the lowest-indexed byte of each 4-byte group: elements 0, 4, 8 and 12. |
| %e0 = extractelement <16 x i8> %val, i32 0 |
| %e4 = extractelement <16 x i8> %val, i32 4 |
| %e8 = extractelement <16 x i8> %val, i32 8 |
| %e12 = extractelement <16 x i8> %val, i32 12 |
| ; Store the four bytes back-to-back at %out + 0..3. The checks above expect them |
| ; to be emitted as one 32-bit global store rather than four byte stores. |
| %out0 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 0 |
| store i8 %e0, ptr addrspace(1) %out0, align 1 |
| %out1 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1 |
| store i8 %e4, ptr addrspace(1) %out1, align 1 |
| %out2 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 2 |
| store i8 %e8, ptr addrspace(1) %out2, align 1 |
| %out3 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 3 |
| store i8 %e12, ptr addrspace(1) %out3, align 1 |
| ret void |
| } |
| |
| ; Multiple extract elements keep the full ds_read_b64. |
| define void @extract_multiple_v8i8(ptr addrspace(1) %out) { |
| ; GFX9-LABEL: extract_multiple_v8i8: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-NEXT: ds_read_b64 v[2:3], v2 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX9-NEXT: s_and_b32 s1, s0, 0xff |
| ; GFX9-NEXT: s_lshr_b32 s0, s0, 24 |
| ; GFX9-NEXT: s_lshl_b32 s0, s0, 8 |
| ; GFX9-NEXT: s_or_b32 s0, s1, s0 |
| ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 |
| ; GFX9-NEXT: v_or_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD |
| ; GFX9-NEXT: global_store_dword v[0:1], v2, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-LABEL: extract_multiple_v8i8: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NEXT: ds_load_b64 v[2:3], v2 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 |
| ; GFX12-NEXT: s_lshr_b32 s1, s0, 16 |
| ; GFX12-NEXT: s_and_b32 s0, s0, 0xff |
| ; GFX12-NEXT: s_and_b32 s1, s1, 0xff00 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX12-NEXT: s_or_b32 s0, s0, s1 |
| ; GFX12-NEXT: s_lshl_b32 s0, s0, 16 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_or_b32_e32 v2, s0, v2 |
| ; GFX12-NEXT: global_store_b32 v[0:1], v2, off |
| ; GFX12-NEXT: s_set_pc_i64 s[30:31] |
| ; Test body: a single 8-byte vector load from @lds feeds four extractelements, |
| ; and each extracted byte is stored to one of four consecutive bytes of %out. |
| ; The autogenerated checks above expect one 64-bit DS load on both targets. |
| ; |
| ; The GEP uses a constant zero index, so %ptr is the address of @lds itself. |
| %ptr = getelementptr inbounds i8, ptr addrspace(3) @lds, i32 0 |
| %val = load <8 x i8>, ptr addrspace(3) %ptr, align 16 |
| ; Each %eN holds element N of the vector. Elements 0 and 1 are adjacent, while |
| ; 4 and 7 are the first and last bytes of the upper four-byte half. |
| %e0 = extractelement <8 x i8> %val, i32 0 |
| %e1 = extractelement <8 x i8> %val, i32 1 |
| %e4 = extractelement <8 x i8> %val, i32 4 |
| %e7 = extractelement <8 x i8> %val, i32 7 |
| ; Store the four bytes back-to-back at %out + 0..3. The checks above expect them |
| ; to be emitted as one 32-bit global store rather than four byte stores. |
| %out0 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 0 |
| store i8 %e0, ptr addrspace(1) %out0, align 1 |
| %out1 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1 |
| store i8 %e1, ptr addrspace(1) %out1, align 1 |
| %out2 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 2 |
| store i8 %e4, ptr addrspace(1) %out2, align 1 |
| %out3 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 3 |
| store i8 %e7, ptr addrspace(1) %out3, align 1 |
| ret void |
| } |
| |
| ; Multiple extract elements keep the 32-bit load. |
| define void @extract_multiple_v4i8(ptr addrspace(1) %out) { |
| ; GFX9-LABEL: extract_multiple_v4i8: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-NEXT: ds_read_b32 v2, v2 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:2 |
| ; GFX9-NEXT: global_store_short v[0:1], v2, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-LABEL: extract_multiple_v4i8: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NEXT: ds_load_b32 v2, v2 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: global_store_d16_hi_b8 v[0:1], v2, off offset:2 |
| ; GFX12-NEXT: global_store_b16 v[0:1], v2, off |
| ; GFX12-NEXT: s_set_pc_i64 s[30:31] |
| ; Test body: a single 4-byte vector load from @lds feeds three extractelements, |
| ; and each extracted byte is stored to one of three consecutive bytes of %out. |
| ; The autogenerated checks above expect one 32-bit DS load on both targets. |
| ; |
| ; The GEP uses a constant zero index, so %ptr is the address of @lds itself. |
| %ptr = getelementptr inbounds i8, ptr addrspace(3) @lds, i32 0 |
| %val = load <4 x i8>, ptr addrspace(3) %ptr, align 4 |
| ; Only elements 0, 1 and 2 are used; element 3 is never extracted. |
| %e0 = extractelement <4 x i8> %val, i32 0 |
| %e1 = extractelement <4 x i8> %val, i32 1 |
| %e2 = extractelement <4 x i8> %val, i32 2 |
| ; Store the three bytes back-to-back at %out + 0..2. The checks above expect a |
| ; 16-bit store at offset 0 plus a byte store at offset 2. |
| %out0 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 0 |
| store i8 %e0, ptr addrspace(1) %out0, align 1 |
| %out1 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1 |
| store i8 %e1, ptr addrspace(1) %out1, align 1 |
| %out2 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 2 |
| store i8 %e2, ptr addrspace(1) %out2, align 1 |
| ret void |
| } |