| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck -check-prefix=GFX1251 %s |
| |
| ; Shifts both i64 lanes of %v left by the same constant (1) and adds %a. |
| ; Both operands arrive as function arguments in VGPRs. The expected output is |
| ; a single v_pk_lshl_add_u64 whose shift operand v[8:9] is filled with the |
| ; constant 1 in each dword. |
| define <2 x i64> @pk_lshl_add_u64_v1v(<2 x i64> %v, <2 x i64> %a) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_v1v: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_mov_b32 s0, 1 |
| ; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1251-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s0 |
| ; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[0:3], v[8:9], v[4:7] |
| ; GFX1251-NEXT: s_set_pc_i64 s[30:31] |
| %shl = shl <2 x i64> %v, <i64 1, i64 1> |
| %add = add <2 x i64> %shl, %a |
| ret <2 x i64> %add |
| } |
| |
| ; Shifts the two i64 lanes of %v by different constants (4 and 5) and adds %a. |
| ; The expected output does not use v_pk_lshl_add_u64: each lane is shifted |
| ; with its own v_lshlrev_b64 and the sum is formed by v_pk_add_nc_u64. |
| define <2 x i64> @pk_lshl_add_u64_v4_5v(<2 x i64> %v, <2 x i64> %a) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_v4_5v: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[2:3], 5, v[2:3] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[0:1], 4, v[0:1] |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-NEXT: s_set_pc_i64 s[30:31] |
| %shl = shl <2 x i64> %v, <i64 4, i64 5> |
| %add = add <2 x i64> %shl, %a |
| ret <2 x i64> %add |
| } |
| |
| ; Shifts %v by the per-lane variable amounts in %s and adds %a. |
| ; With non-constant shift amounts the expected output is two v_lshlrev_b64 |
| ; (taking the amounts from v4 and v6) followed by v_pk_add_nc_u64; |
| ; v_pk_lshl_add_u64 is not expected. |
| define <2 x i64> @pk_lshl_add_u64_vvv(<2 x i64> %v, <2 x i64> %s, <2 x i64> %a) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_vvv: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[0:1], v4, v[0:1] |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[8:11] |
| ; GFX1251-NEXT: s_set_pc_i64 s[30:31] |
| %shl = shl <2 x i64> %v, %s |
| %add = add <2 x i64> %shl, %a |
| ret <2 x i64> %add |
| } |
| |
| ; Kernel variant: the shifted value %v is a kernel argument and the addend %a |
| ; is loaded from memory (through a poison pointer). Both lanes are shifted by 2. |
| ; The expected output copies the argument from s[0:3] into v[4:7] and issues |
| ; one v_pk_lshl_add_u64 with v[4:7] as the shifted operand and the loaded |
| ; v[0:3] as the addend. |
| define amdgpu_kernel void @pk_lshl_add_u64_s2v(<2 x i64> %v) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_s2v: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: flat_load_b128 v[0:3], v[0:1] |
| ; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: s_wait_xcnt 0x0 |
| ; GFX1251-NEXT: s_mov_b32 s4, 2 |
| ; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) |
| ; GFX1251-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s4 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[8:9], v[0:3] |
| ; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3] |
| ; GFX1251-NEXT: s_endpgm |
| %a = load <2 x i64>, ptr poison |
| %shl = shl <2 x i64> %v, <i64 2, i64 2> |
| %add = add <2 x i64> %shl, %a |
| store <2 x i64> %add, ptr poison |
| ret void |
| } |
| |
| ; Kernel variant with the operand roles swapped: the shifted value %v is |
| ; loaded from memory (through a poison pointer) and the addend %a is a kernel |
| ; argument. Both lanes are shifted by 2. The expected output is one |
| ; v_pk_lshl_add_u64 with the loaded v[0:3] as the shifted operand and the |
| ; argument, copied from s[0:3] into v[4:7], as the addend. |
| define amdgpu_kernel void @pk_lshl_add_u64_v2s(<2 x i64> %a) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_v2s: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: flat_load_b128 v[0:3], v[0:1] |
| ; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: s_wait_xcnt 0x0 |
| ; GFX1251-NEXT: s_mov_b32 s4, 2 |
| ; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) |
| ; GFX1251-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s4 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[0:3], v[8:9], v[4:7] |
| ; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3] |
| ; GFX1251-NEXT: s_endpgm |
| %v = load <2 x i64>, ptr poison |
| %shl = shl <2 x i64> %v, <i64 2, i64 2> |
| %add = add <2 x i64> %shl, %a |
| store <2 x i64> %add, ptr poison |
| ret void |
| } |
| |
| ; Kernel variant where both the shifted value %v and the addend %a are kernel |
| ; arguments; both lanes are shifted by 2. The expected output performs the |
| ; shifts on the scalar unit (two s_lshl_b64), copies the results and the |
| ; addend into VGPRs, and adds them with v_pk_add_nc_u64; v_pk_lshl_add_u64 is |
| ; not expected. |
| define amdgpu_kernel void @pk_lshl_add_u64_s2s(<2 x i64> %v, <2 x i64> %a) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_s2s: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2 |
| ; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 2 |
| ; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3] |
| ; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3] |
| ; GFX1251-NEXT: s_endpgm |
| %shl = shl <2 x i64> %v, <i64 2, i64 2> |
| %add = add <2 x i64> %shl, %a |
| store <2 x i64> %add, ptr poison |
| ret void |
| } |
| |
| ; Address computation case: an inbounds vector GEP over i32 elements with |
| ; i64 indices %a on the pointer vector %p. The expected output forms both |
| ; addresses with one v_pk_lshl_add_u64 (indices v[4:7] shifted by 2, added to |
| ; the pointers v[0:3]), then loads through each address and returns the sum |
| ; of the two loaded i32 values. |
| define i32 @pk_lshl_add_u64_gep(<2 x ptr> %p, <2 x i64> %a) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_gep: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_mov_b32 s0, 2 |
| ; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1251-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s0 |
| ; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[8:9], v[0:3] |
| ; GFX1251-NEXT: flat_load_b32 v4, v[0:1] |
| ; GFX1251-NEXT: flat_load_b32 v5, v[2:3] |
| ; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1251-NEXT: s_wait_xcnt 0x1 |
| ; GFX1251-NEXT: v_add_nc_u32_e32 v0, v4, v5 |
| ; GFX1251-NEXT: s_set_pc_i64 s[30:31] |
| %gep = getelementptr inbounds i32, <2 x ptr> %p, <2 x i64> %a |
| %gep0 = extractelement <2 x ptr> %gep, i32 0 |
| %gep1 = extractelement <2 x ptr> %gep, i32 1 |
| %v0 = load i32, ptr %gep0 |
| %v1 = load i32, ptr %gep1 |
| %v = add i32 %v0, %v1 |
| ret i32 %v |
| } |
| |
| ; Address computation case without the inbounds flag: the i32 indices are |
| ; %i + 3 (nsw) and the GEP over i32 elements is a plain getelementptr. |
| ; The expected output sign-extends each index to 64 bits (v_ashrrev_i32 by 31 |
| ; supplies the high dword), applies v_pk_lshl_add_u64 with a shift of 2 to the |
| ; extended indices and the pointers, and then adds the constant byte offset |
| ; 12 (3 elements of 4 bytes) with a separate v_pk_add_nc_u64 before the loads. |
| define i32 @pk_lshl_add_u64_maybe_oob(<2 x ptr> %p, <2 x i32> %i) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_maybe_oob: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_ashrrev_i32 v5, 31, v4 |
| ; GFX1251-NEXT: s_mov_b32 s0, 2 |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) |
| ; GFX1251-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_ashrrev_i32 v7, 31, v6 |
| ; GFX1251-NEXT: v_mov_b32_e32 v8, s0 |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[8:9], v[0:3] |
| ; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], 12 |
| ; GFX1251-NEXT: flat_load_b32 v4, v[0:1] |
| ; GFX1251-NEXT: flat_load_b32 v5, v[2:3] |
| ; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX1251-NEXT: s_wait_xcnt 0x1 |
| ; GFX1251-NEXT: v_add_nc_u32_e32 v0, v4, v5 |
| ; GFX1251-NEXT: s_set_pc_i64 s[30:31] |
| %idx = add nsw <2 x i32> %i, <i32 3, i32 3> |
| %gep = getelementptr i32, <2 x ptr> %p, <2 x i32> %idx |
| %gep0 = extractelement <2 x ptr> %gep, i32 0 |
| %gep1 = extractelement <2 x ptr> %gep, i32 1 |
| %v0 = load i32, ptr %gep0 |
| %v1 = load i32, ptr %gep1 |
| %v = add i32 %v0, %v1 |
| ret i32 %v |
| } |
| |
| ; Kernel variant with both operands passed as kernel arguments and different |
| ; per-lane shift amounts (2 and 3). The expected output shifts each lane with |
| ; its own s_lshl_b64 and adds the results to %a with v_pk_add_nc_u64; |
| ; v_pk_lshl_add_u64 is not expected. |
| define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_3(<2 x i64> %v, <2 x i64> %a) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_s2s_shift2_3: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2 |
| ; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 |
| ; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3] |
| ; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3] |
| ; GFX1251-NEXT: s_endpgm |
| %shl = shl <2 x i64> %v, <i64 2, i64 3> |
| %add = add <2 x i64> %shl, %a |
| store <2 x i64> %add, ptr poison |
| ret void |
| } |
| |
| ; Kernel variant with both operands passed as kernel arguments and per-lane |
| ; shift amounts of 2 and 4. |
| ; FIXME: It should be possible to use v_pk_lshl_add_u64 here, |
| ; but ComputeKnownBits does not understand this vector of shift amounts. |
| ; Until that is addressed, the expected output shifts each lane with its own |
| ; s_lshl_b64 and adds the results to %a with v_pk_add_nc_u64. |
| define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_4(<2 x i64> %v, <2 x i64> %a) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_s2s_shift2_4: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2 |
| ; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 4 |
| ; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3] |
| ; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3] |
| ; GFX1251-NEXT: s_endpgm |
| %shl = shl <2 x i64> %v, <i64 2, i64 4> |
| %add = add <2 x i64> %shl, %a |
| store <2 x i64> %add, ptr poison |
| ret void |
| } |
| |
| ; Kernel variant with both operands passed as kernel arguments and per-lane |
| ; shift amounts of 2 and 5. The expected output shifts each lane with its own |
| ; s_lshl_b64 and adds the results to %a with v_pk_add_nc_u64; |
| ; v_pk_lshl_add_u64 is not expected. |
| define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_5(<2 x i64> %v, <2 x i64> %a) { |
| ; GFX1251-LABEL: pk_lshl_add_u64_s2s_shift2_5: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2 |
| ; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 5 |
| ; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3] |
| ; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3] |
| ; GFX1251-NEXT: s_endpgm |
| %shl = shl <2 x i64> %v, <i64 2, i64 5> |
| %add = add <2 x i64> %shl, %a |
| store <2 x i64> %add, ptr poison |
| ret void |
| } |