blob: c4d2541e7fcf899ae8987b9daf1294b02ea02d1b [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck -check-prefix=GFX1251 %s
; Shift both i64 elements of %v left by the same constant (1) and add %a.
; The expected output is a single v_pk_lshl_add_u64: the constant 1 is placed
; in s0 and copied into both v8 and v9, which together form the shift operand.
define <2 x i64> @pk_lshl_add_u64_v1v(<2 x i64> %v, <2 x i64> %a) {
; GFX1251-LABEL: pk_lshl_add_u64_v1v:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_mov_b32 s0, 1
; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1251-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s0
; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[0:3], v[8:9], v[4:7]
; GFX1251-NEXT: s_set_pc_i64 s[30:31]
%shl = shl <2 x i64> %v, <i64 1, i64 1>
%add = add <2 x i64> %shl, %a
ret <2 x i64> %add
}
; Shift the two elements of %v by different constants (4 and 5) and add %a.
; The expected output does not use v_pk_lshl_add_u64: each element is shifted
; with its own v_lshlrev_b64 and the sum is formed by v_pk_add_nc_u64.
; NOTE(review): presumably the shift amount 5 is what rules out the packed
; shift-add here; this file does not state the accepted range - confirm.
define <2 x i64> @pk_lshl_add_u64_v4_5v(<2 x i64> %v, <2 x i64> %a) {
; GFX1251-LABEL: pk_lshl_add_u64_v4_5v:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: v_lshlrev_b64_e32 v[2:3], 5, v[2:3]
; GFX1251-NEXT: v_lshlrev_b64_e32 v[0:1], 4, v[0:1]
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-NEXT: s_set_pc_i64 s[30:31]
%shl = shl <2 x i64> %v, <i64 4, i64 5>
%add = add <2 x i64> %shl, %a
ret <2 x i64> %add
}
; Shift %v by a non-constant shift vector %s and add %a.
; The expected output does not use v_pk_lshl_add_u64: each element is shifted
; by v_lshlrev_b64 (shift amounts taken from v4 and v6) and the addend in
; v[8:11] is then added with v_pk_add_nc_u64.
define <2 x i64> @pk_lshl_add_u64_vvv(<2 x i64> %v, <2 x i64> %s, <2 x i64> %a) {
; GFX1251-LABEL: pk_lshl_add_u64_vvv:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3]
; GFX1251-NEXT: v_lshlrev_b64_e32 v[0:1], v4, v[0:1]
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[8:11]
; GFX1251-NEXT: s_set_pc_i64 s[30:31]
%shl = shl <2 x i64> %v, %s
%add = add <2 x i64> %shl, %a
ret <2 x i64> %add
}
; Kernel variant: the shifted value %v is a kernel argument and the addend %a
; is loaded from memory (through a poison pointer, so the address is
; irrelevant). Both elements are shifted by 2.
; The expected output copies the argument from s[0:3] into v[4:7] and uses it
; as the shifted operand of v_pk_lshl_add_u64, with the loaded value v[0:3]
; as the addend; the shift amount 2 is copied into both v8 and v9.
define amdgpu_kernel void @pk_lshl_add_u64_s2v(<2 x i64> %v) {
; GFX1251-LABEL: pk_lshl_add_u64_s2v:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-NEXT: flat_load_b128 v[0:3], v[0:1]
; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
; GFX1251-NEXT: s_wait_xcnt 0x0
; GFX1251-NEXT: s_mov_b32 s4, 2
; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1251-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s4
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[8:9], v[0:3]
; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX1251-NEXT: s_endpgm
%a = load <2 x i64>, ptr poison
%shl = shl <2 x i64> %v, <i64 2, i64 2>
%add = add <2 x i64> %shl, %a
store <2 x i64> %add, ptr poison
ret void
}
; Kernel variant with the operand roles swapped relative to the s2v test: the
; shifted value %v is loaded from memory (through a poison pointer) and the
; addend %a is a kernel argument. Both elements are shifted by 2.
; The expected output uses the loaded value v[0:3] as the shifted operand of
; v_pk_lshl_add_u64 and the argument, copied from s[0:3] into v[4:7], as the
; addend.
define amdgpu_kernel void @pk_lshl_add_u64_v2s(<2 x i64> %a) {
; GFX1251-LABEL: pk_lshl_add_u64_v2s:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-NEXT: flat_load_b128 v[0:3], v[0:1]
; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
; GFX1251-NEXT: s_wait_xcnt 0x0
; GFX1251-NEXT: s_mov_b32 s4, 2
; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1251-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s4
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[0:3], v[8:9], v[4:7]
; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX1251-NEXT: s_endpgm
%v = load <2 x i64>, ptr poison
%shl = shl <2 x i64> %v, <i64 2, i64 2>
%add = add <2 x i64> %shl, %a
store <2 x i64> %add, ptr poison
ret void
}
; Kernel variant where both the shifted value %v and the addend %a are kernel
; arguments; both elements are shifted by 2.
; The expected output does not use v_pk_lshl_add_u64: each element is shifted
; on the scalar unit with s_lshl_b64, the results and the addend are copied to
; vector registers, and the sum is formed by v_pk_add_nc_u64.
define amdgpu_kernel void @pk_lshl_add_u64_s2s(<2 x i64> %v, <2 x i64> %a) {
; GFX1251-LABEL: pk_lshl_add_u64_s2s:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 2
; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3]
; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX1251-NEXT: s_endpgm
%shl = shl <2 x i64> %v, <i64 2, i64 2>
%add = add <2 x i64> %shl, %a
store <2 x i64> %add, ptr poison
ret void
}
; Address computation case: an inbounds vector getelementptr over i32 with a
; <2 x i64> index, followed by a load through each resulting pointer and an
; add of the two loaded values.
; The expected output forms both addresses with one v_pk_lshl_add_u64: the
; indices in v[4:7] are shifted by 2 and added to the pointers in v[0:3].
define i32 @pk_lshl_add_u64_gep(<2 x ptr> %p, <2 x i64> %a) {
; GFX1251-LABEL: pk_lshl_add_u64_gep:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_mov_b32 s0, 2
; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1251-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s0
; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[8:9], v[0:3]
; GFX1251-NEXT: flat_load_b32 v4, v[0:1]
; GFX1251-NEXT: flat_load_b32 v5, v[2:3]
; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1251-NEXT: s_wait_xcnt 0x1
; GFX1251-NEXT: v_add_nc_u32_e32 v0, v4, v5
; GFX1251-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr inbounds i32, <2 x ptr> %p, <2 x i64> %a
%gep0 = extractelement <2 x ptr> %gep, i32 0
%gep1 = extractelement <2 x ptr> %gep, i32 1
%v0 = load i32, ptr %gep0
%v1 = load i32, ptr %gep1
%v = add i32 %v0, %v1
ret i32 %v
}
; Address computation with a <2 x i32> index that has a constant 3 added
; (nsw) before it feeds a getelementptr that is NOT marked inbounds.
; The expected output sign-extends each index to 64 bits (v_ashrrev_i32 by
; 31 produces the high dwords), applies v_pk_lshl_add_u64 with shift 2 to the
; original indices and the pointers, and adds the constant part as a separate
; v_pk_add_nc_u64 of 12 (3 elements * 4 bytes per i32).
define i32 @pk_lshl_add_u64_maybe_oob(<2 x ptr> %p, <2 x i32> %i) {
; GFX1251-LABEL: pk_lshl_add_u64_maybe_oob:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_ashrrev_i32 v5, 31, v4
; GFX1251-NEXT: s_mov_b32 s0, 2
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1251-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_ashrrev_i32 v7, 31, v6
; GFX1251-NEXT: v_mov_b32_e32 v8, s0
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1251-NEXT: v_pk_lshl_add_u64 v[0:3], v[4:7], v[8:9], v[0:3]
; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], 12
; GFX1251-NEXT: flat_load_b32 v4, v[0:1]
; GFX1251-NEXT: flat_load_b32 v5, v[2:3]
; GFX1251-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1251-NEXT: s_wait_xcnt 0x1
; GFX1251-NEXT: v_add_nc_u32_e32 v0, v4, v5
; GFX1251-NEXT: s_set_pc_i64 s[30:31]
%idx = add nsw <2 x i32> %i, <i32 3, i32 3>
%gep = getelementptr i32, <2 x ptr> %p, <2 x i32> %idx
%gep0 = extractelement <2 x ptr> %gep, i32 0
%gep1 = extractelement <2 x ptr> %gep, i32 1
%v0 = load i32, ptr %gep0
%v1 = load i32, ptr %gep1
%v = add i32 %v0, %v1
ret i32 %v
}
; Kernel variant with both operands passed as kernel arguments and a
; different constant shift per element (2 and 3).
; The expected output shifts each element with s_lshl_b64 and sums with
; v_pk_add_nc_u64; v_pk_lshl_add_u64 is not used.
define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_3(<2 x i64> %v, <2 x i64> %a) {
; GFX1251-LABEL: pk_lshl_add_u64_s2s_shift2_3:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3]
; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX1251-NEXT: s_endpgm
%shl = shl <2 x i64> %v, <i64 2, i64 3>
%add = add <2 x i64> %shl, %a
store <2 x i64> %add, ptr poison
ret void
}
; FIXME: It should be possible to use v_pk_lshl_add_u64 here,
; but ComputeKnownBits does not understand this vector of shift amounts.
; Kernel variant with both operands passed as kernel arguments and a
; different constant shift per element (2 and 4).
; The expected output currently shifts each element with s_lshl_b64 and sums
; with v_pk_add_nc_u64; v_pk_lshl_add_u64 is not used.
define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_4(<2 x i64> %v, <2 x i64> %a) {
; GFX1251-LABEL: pk_lshl_add_u64_s2s_shift2_4:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 4
; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3]
; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX1251-NEXT: s_endpgm
%shl = shl <2 x i64> %v, <i64 2, i64 4>
%add = add <2 x i64> %shl, %a
store <2 x i64> %add, ptr poison
ret void
}
; Kernel variant with both operands passed as kernel arguments and a
; different constant shift per element (2 and 5).
; The expected output shifts each element with s_lshl_b64 and sums with
; v_pk_add_nc_u64; v_pk_lshl_add_u64 is not used.
define amdgpu_kernel void @pk_lshl_add_u64_s2s_shift2_5(<2 x i64> %v, <2 x i64> %a) {
; GFX1251-LABEL: pk_lshl_add_u64_s2s_shift2_5:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], 2
; GFX1251-NEXT: s_lshl_b64 s[2:3], s[10:11], 5
; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[4:7], v[0:3]
; GFX1251-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX1251-NEXT: s_endpgm
%shl = shl <2 x i64> %v, <i64 2, i64 5>
%add = add <2 x i64> %shl, %a
store <2 x i64> %add, ptr poison
ret void
}