| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 |
| ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s |
| ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s |
| ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030 %s |
| ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100 %s |
| |
; test_insert_extract: kernel whose single loop block extracts from and
; inserts into two loop-carried <4 x i32> vectors, using an element index
; taken from the kernel argument %q.
; NOTE(review): presumably a codegen regression test for variable-index
; extractelement/insertelement on loop-carried vectors -- confirm intent.
; The per-target check lines inside the function are autogenerated (see the
; NOTE on the first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
| define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { |
| ; GFX90A-LABEL: test_insert_extract: |
| ; GFX90A: ; %bb.0: ; %entry |
| ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 |
| ; GFX90A-NEXT: s_mov_b32 s2, 0 |
| ; GFX90A-NEXT: s_and_b64 vcc, exec, -1 |
| ; GFX90A-NEXT: s_mov_b32 s3, 0 |
| ; GFX90A-NEXT: s_mov_b32 s4, 0 |
| ; GFX90A-NEXT: s_mov_b32 s5, 0 |
| ; GFX90A-NEXT: s_mov_b32 s6, 0 |
| ; GFX90A-NEXT: .LBB0_1: ; %for.body |
| ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 |
| ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 |
| ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec |
| ; GFX90A-NEXT: s_cselect_b32 s7, s4, s3 |
| ; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 |
| ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 |
| ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec |
| ; GFX90A-NEXT: s_cselect_b32 s7, s5, s7 |
| ; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 |
| ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 |
| ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec |
| ; GFX90A-NEXT: s_cselect_b32 s7, s6, s7 |
| ; GFX90A-NEXT: s_or_b32 s7, s7, s0 |
| ; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 |
| ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 |
| ; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec |
| ; GFX90A-NEXT: s_cselect_b32 s4, s7, s4 |
| ; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 |
| ; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0 |
| ; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec |
| ; GFX90A-NEXT: s_cselect_b32 s6, s7, s6 |
| ; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 |
| ; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0 |
| ; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec |
| ; GFX90A-NEXT: s_cselect_b32 s5, s7, s5 |
| ; GFX90A-NEXT: s_cmp_eq_u32 s1, 0 |
| ; GFX90A-NEXT: s_cselect_b32 s3, s7, s3 |
| ; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] |
| ; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] |
| ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec |
| ; GFX90A-NEXT: s_cselect_b32 s2, 0, s2 |
| ; GFX90A-NEXT: s_mov_b64 vcc, vcc |
| ; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1 |
| ; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock |
| ; GFX90A-NEXT: s_endpgm |
| ; |
| ; GFX942-LABEL: test_insert_extract: |
| ; GFX942: ; %bb.0: ; %entry |
| ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 |
| ; GFX942-NEXT: s_mov_b32 s2, 0 |
| ; GFX942-NEXT: s_and_b64 vcc, exec, -1 |
| ; GFX942-NEXT: s_mov_b32 s3, 0 |
| ; GFX942-NEXT: s_mov_b32 s4, 0 |
| ; GFX942-NEXT: s_mov_b32 s5, 0 |
| ; GFX942-NEXT: s_mov_b32 s6, 0 |
| ; GFX942-NEXT: .LBB0_1: ; %for.body |
| ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX942-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX942-NEXT: s_cmp_eq_u32 s1, 1 |
| ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 |
| ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec |
| ; GFX942-NEXT: s_cselect_b32 s7, s4, s3 |
| ; GFX942-NEXT: s_cmp_eq_u32 s1, 2 |
| ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 |
| ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec |
| ; GFX942-NEXT: s_cselect_b32 s7, s5, s7 |
| ; GFX942-NEXT: s_cmp_eq_u32 s1, 3 |
| ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 |
| ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec |
| ; GFX942-NEXT: s_cselect_b32 s7, s6, s7 |
| ; GFX942-NEXT: s_or_b32 s7, s7, s0 |
| ; GFX942-NEXT: s_cmp_eq_u32 s1, 1 |
| ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 |
| ; GFX942-NEXT: s_and_b64 s[10:11], s[8:9], exec |
| ; GFX942-NEXT: s_cselect_b32 s4, s7, s4 |
| ; GFX942-NEXT: s_cmp_eq_u32 s1, 3 |
| ; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 |
| ; GFX942-NEXT: s_and_b64 s[12:13], s[10:11], exec |
| ; GFX942-NEXT: s_cselect_b32 s6, s7, s6 |
| ; GFX942-NEXT: s_cmp_eq_u32 s1, 2 |
| ; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0 |
| ; GFX942-NEXT: s_and_b64 s[14:15], s[12:13], exec |
| ; GFX942-NEXT: s_cselect_b32 s5, s7, s5 |
| ; GFX942-NEXT: s_cmp_eq_u32 s1, 0 |
| ; GFX942-NEXT: s_cselect_b32 s3, s7, s3 |
| ; GFX942-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] |
| ; GFX942-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] |
| ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec |
| ; GFX942-NEXT: s_cselect_b32 s2, 0, s2 |
| ; GFX942-NEXT: s_mov_b64 vcc, vcc |
| ; GFX942-NEXT: s_cbranch_vccnz .LBB0_1 |
| ; GFX942-NEXT: ; %bb.2: ; %DummyReturnBlock |
| ; GFX942-NEXT: s_endpgm |
| ; |
| ; GFX1030-LABEL: test_insert_extract: |
| ; GFX1030: ; %bb.0: ; %entry |
| ; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 |
| ; GFX1030-NEXT: s_mov_b32 s2, 0 |
| ; GFX1030-NEXT: s_mov_b32 s3, 0 |
| ; GFX1030-NEXT: s_mov_b32 s4, 0 |
| ; GFX1030-NEXT: s_mov_b32 s5, 0 |
| ; GFX1030-NEXT: s_mov_b32 s6, 0 |
| ; GFX1030-NEXT: s_mov_b32 vcc_lo, exec_lo |
| ; GFX1030-NEXT: .p2align 6 |
| ; GFX1030-NEXT: .LBB0_1: ; %for.body |
| ; GFX1030-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 |
| ; GFX1030-NEXT: s_cselect_b32 s7, -1, 0 |
| ; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo |
| ; GFX1030-NEXT: s_cselect_b32 s7, s4, s3 |
| ; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 |
| ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 |
| ; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo |
| ; GFX1030-NEXT: s_cselect_b32 s7, s5, s7 |
| ; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 |
| ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 |
| ; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo |
| ; GFX1030-NEXT: s_cselect_b32 s7, s6, s7 |
| ; GFX1030-NEXT: s_or_b32 s7, s7, s0 |
| ; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 |
| ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 |
| ; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo |
| ; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 |
| ; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 |
| ; GFX1030-NEXT: s_cselect_b32 s9, -1, 0 |
| ; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo |
| ; GFX1030-NEXT: s_cselect_b32 s6, s7, s6 |
| ; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 |
| ; GFX1030-NEXT: s_cselect_b32 s10, -1, 0 |
| ; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo |
| ; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 |
| ; GFX1030-NEXT: s_cmp_eq_u32 s1, 0 |
| ; GFX1030-NEXT: s_cselect_b32 s3, s7, s3 |
| ; GFX1030-NEXT: s_or_b32 s7, s10, s8 |
| ; GFX1030-NEXT: s_or_b32 s7, s9, s7 |
| ; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo |
| ; GFX1030-NEXT: s_cselect_b32 s2, 0, s2 |
| ; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1 |
| ; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock |
| ; GFX1030-NEXT: s_endpgm |
| ; |
| ; GFX1100-LABEL: test_insert_extract: |
| ; GFX1100: ; %bb.0: ; %entry |
| ; GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 |
| ; GFX1100-NEXT: s_mov_b32 s2, 0 |
| ; GFX1100-NEXT: s_mov_b32 s3, 0 |
| ; GFX1100-NEXT: s_mov_b32 s4, 0 |
| ; GFX1100-NEXT: s_mov_b32 s5, 0 |
| ; GFX1100-NEXT: s_mov_b32 s6, 0 |
| ; GFX1100-NEXT: s_mov_b32 vcc_lo, exec_lo |
| ; GFX1100-NEXT: .p2align 6 |
| ; GFX1100-NEXT: .LBB0_1: ; %for.body |
| ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX1100-NEXT: s_cmp_eq_u32 s1, 1 |
| ; GFX1100-NEXT: s_cselect_b32 s7, -1, 0 |
| ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) |
| ; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo |
| ; GFX1100-NEXT: s_cselect_b32 s7, s4, s3 |
| ; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 |
| ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 |
| ; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo |
| ; GFX1100-NEXT: s_cselect_b32 s7, s5, s7 |
| ; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 |
| ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 |
| ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) |
| ; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo |
| ; GFX1100-NEXT: s_cselect_b32 s7, s6, s7 |
| ; GFX1100-NEXT: s_or_b32 s7, s7, s0 |
| ; GFX1100-NEXT: s_cmp_eq_u32 s1, 1 |
| ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 |
| ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) |
| ; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo |
| ; GFX1100-NEXT: s_cselect_b32 s4, s7, s4 |
| ; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 |
| ; GFX1100-NEXT: s_cselect_b32 s9, -1, 0 |
| ; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo |
| ; GFX1100-NEXT: s_cselect_b32 s6, s7, s6 |
| ; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 |
| ; GFX1100-NEXT: s_cselect_b32 s10, -1, 0 |
| ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) |
| ; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo |
| ; GFX1100-NEXT: s_cselect_b32 s5, s7, s5 |
| ; GFX1100-NEXT: s_cmp_eq_u32 s1, 0 |
| ; GFX1100-NEXT: s_cselect_b32 s3, s7, s3 |
| ; GFX1100-NEXT: s_or_b32 s7, s10, s8 |
| ; GFX1100-NEXT: s_or_b32 s7, s9, s7 |
| ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo |
| ; GFX1100-NEXT: s_cselect_b32 s2, 0, s2 |
| ; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 |
| ; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock |
| ; GFX1100-NEXT: s_endpgm |
; %init is an all-zero vector with the constant 0 inserted at element 0.
| entry: |
| %init = insertelement <4 x i32> zeroinitializer, i32 0, i64 0 |
| br label %for.body |
| |
; The loop has no exit edge: for.body branches unconditionally to itself.
| for.body: ; preds = %for.body, %entry |
; %x1 and %x2 are the two loop-carried vectors; both are all-zero on entry.
| %x1 = phi <4 x i32> [ %init, %entry ], [ %i4, %for.body ] |
| %x2 = phi <4 x i32> [ zeroinitializer, %entry ], [ %i2, %for.body ] |
; The element index is the kernel argument %q, zero-extended to i64.
| %idxprom = zext i32 %q to i64 |
; %x2: extract at the variable index, OR the element with %p (despite its
; name, %add is a bitwise OR), and insert the result back at the same index.
| %e1 = extractelement <4 x i32> %x2, i64 %idxprom |
| %add = or i32 %e1, %p |
| %i2 = insertelement <4 x i32> %x2, i32 %add, i64 %idxprom |
; %x1: extract at the variable index, then insert at constant element 0.
| %e3 = extractelement <4 x i32> %x1, i64 %idxprom |
| %i4 = insertelement <4 x i32> %x1, i32 %e3, i64 0 |
| br label %for.body |
| } |
| |