; blob: bbd87fad143e344cce252521333e923ecd12ce9b [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1251,GFX1251-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1251,GFX1251-GISEL %s
; <2 x i64> add where both operands are the same value loaded from memory.
; Each work-item loads the element of %a selected by workitem.id.x, adds it to
; itself and stores the sum back to the same address. Both instruction
; selectors share one set of checks, which expect a single v_pk_add_nc_u64
; using the loaded VGPR tuple for both sources.
define amdgpu_kernel void @add_v2_vv(ptr addrspace(1) %a) {
; GFX1251-LABEL: add_v2_vv:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset
; GFX1251-NEXT: s_wait_loadcnt 0x0
; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[0:3]
; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
; GFX1251-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%add = add <2 x i64> %load, %load
store <2 x i64> %add, ptr addrspace(1) %gep, align 8
ret void
}
; <2 x i64> add of a value loaded from memory and the kernel argument %x.
; The checks show %x arriving in SGPRs via s_load_b128 and being copied into
; v[4:7] with two v_mov_b64 before the single v_pk_add_nc_u64. Both
; instruction selectors share one set of checks.
define amdgpu_kernel void @add_v2_vs(ptr addrspace(1) %a, <2 x i64> %x) {
; GFX1251-LABEL: add_v2_vs:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-NEXT: s_clause 0x1
; GFX1251-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
; GFX1251-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: s_wait_loadcnt 0x0
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%add = add <2 x i64> %load, %x
store <2 x i64> %add, ptr addrspace(1) %gep, align 8
ret void
}
; <2 x i64> add where both operands are kernel arguments (%x and %y); the sum
; is stored directly to %a with no per-work-item indexing.
; Both selectors are expected to copy the two arguments from SGPRs into VGPR
; tuples and emit one v_pk_add_nc_u64. The checks differ only in how the copies
; are done: SelectionDAG uses v_dual_mov_b32 pairs, GlobalISel uses v_mov_b64.
define amdgpu_kernel void @add_v2_ss(ptr addrspace(1) %a, <2 x i64> %x, <2 x i64> %y) {
; GFX1251-SDAG-LABEL: add_v2_ss:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s8
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v4, s12
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v6, s14
; GFX1251-SDAG-NEXT: v_mov_b32_e32 v7, s15
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: add_v2_ss:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_clause 0x1
; GFX1251-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv
; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX1251-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX1251-GISEL-NEXT: s_endpgm
%add = add <2 x i64> %x, %y
store <2 x i64> %add, ptr addrspace(1) %a, align 8
ret void
}
; <4 x i64> add of a value loaded from memory and the kernel argument %x.
; The checks expect the operation to be split into two 128-bit pieces: two
; global_load_b128 (offsets 0 and 16), two v_pk_add_nc_u64 and two
; global_store_b128. SelectionDAG and GlobalISel differ in how %x is copied
; from SGPRs to VGPRs and in which VGPR tuple feeds each add.
define amdgpu_kernel void @add_v4_vs(ptr addrspace(1) %a, <4 x i64> %x) {
; GFX1251-SDAG-LABEL: add_v4_vs:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 nv
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_lshlrev_b32_e32 v16, 5, v0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v16, s[0:1]
; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v12, s8
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v13, s9 :: v_dual_mov_b32 v14, s10
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v11, s15
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[12:15]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[4:7], v[4:7], v[8:11]
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: global_store_b128 v16, v[0:3], s[0:1]
; GFX1251-SDAG-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: add_v4_vs:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 nv
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-GISEL-NEXT: v_lshlrev_b32_e32 v16, 5, v0
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: s_clause 0x1
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v16, s[0:1]
; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x1
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[8:11]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[4:7], v[4:7], v[12:15]
; GFX1251-GISEL-NEXT: s_clause 0x1
; GFX1251-GISEL-NEXT: global_store_b128 v16, v[0:3], s[0:1]
; GFX1251-GISEL-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <4 x i64>, ptr addrspace(1) %gep, align 16
%add = add <4 x i64> %load, %x
store <4 x i64> %add, ptr addrspace(1) %gep, align 16
ret void
}
; <32 x i64> add of a value loaded from memory and the kernel argument %x.
; The checks expect the operation to be split into sixteen 128-bit pieces:
; sixteen global_load_b128, sixteen v_pk_add_nc_u64 and sixteen
; global_store_b128 per selector. %x is fetched with four s_load_b512 and
; copied into a small set of VGPR tuples that are reused between adds.
; SelectionDAG issues all four scalar loads up front; GlobalISel issues two
; up front and the other two interleaved with the adds. The load/store order
; and register assignment also differ between the two selectors.
define amdgpu_kernel void @add_v32_vs(ptr addrspace(1) %a, <32 x i64> %x) {
; GFX1251-SDAG-LABEL: add_v32_vs:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-SDAG-NEXT: s_clause 0x3
; GFX1251-SDAG-NEXT: s_load_b512 s[68:83], s[4:5], 0x1a4 nv
; GFX1251-SDAG-NEXT: s_load_b512 s[52:67], s[4:5], 0x1e4 nv
; GFX1251-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0x124 nv
; GFX1251-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0x164 nv
; GFX1251-SDAG-NEXT: v_lshlrev_b32_e32 v44, 8, v0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: s_clause 0xf
; GFX1251-SDAG-NEXT: global_load_b128 v[40:43], v44, s[0:1] offset:144
; GFX1251-SDAG-NEXT: global_load_b128 v[36:39], v44, s[0:1] offset:128
; GFX1251-SDAG-NEXT: global_load_b128 v[32:35], v44, s[0:1] offset:176
; GFX1251-SDAG-NEXT: global_load_b128 v[28:31], v44, s[0:1] offset:160
; GFX1251-SDAG-NEXT: global_load_b128 v[24:27], v44, s[0:1] offset:208
; GFX1251-SDAG-NEXT: global_load_b128 v[20:23], v44, s[0:1] offset:192
; GFX1251-SDAG-NEXT: global_load_b128 v[16:19], v44, s[0:1] offset:240
; GFX1251-SDAG-NEXT: global_load_b128 v[12:15], v44, s[0:1] offset:224
; GFX1251-SDAG-NEXT: global_load_b128 v[8:11], v44, s[0:1] offset:16
; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v44, s[0:1]
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v44, s[0:1] offset:48
; GFX1251-SDAG-NEXT: global_load_b128 v[46:49], v44, s[0:1] offset:64
; GFX1251-SDAG-NEXT: global_load_b128 v[50:53], v44, s[0:1] offset:112
; GFX1251-SDAG-NEXT: global_load_b128 v[54:57], v44, s[0:1] offset:96
; GFX1251-SDAG-NEXT: global_load_b128 v[58:61], v44, s[0:1] offset:80
; GFX1251-SDAG-NEXT: global_load_b128 v[62:65], v44, s[0:1] offset:32
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s72 :: v_dual_mov_b32 v67, s73
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s74 :: v_dual_mov_b32 v69, s75
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s68 :: v_dual_mov_b32 v71, s69
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s70 :: v_dual_mov_b32 v73, s71
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xf
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[40:43], v[40:43], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s80 :: v_dual_mov_b32 v67, s81
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s82 :: v_dual_mov_b32 v69, s83
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xe
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[36:39], v[36:39], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s76 :: v_dual_mov_b32 v71, s77
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s78 :: v_dual_mov_b32 v73, s79
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xd
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[32:35], v[32:35], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s56 :: v_dual_mov_b32 v67, s57
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s58 :: v_dual_mov_b32 v69, s59
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xc
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[28:31], v[28:31], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s52 :: v_dual_mov_b32 v71, s53
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s54 :: v_dual_mov_b32 v73, s55
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xb
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[24:27], v[24:27], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s64 :: v_dual_mov_b32 v67, s65
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s66 :: v_dual_mov_b32 v69, s67
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xa
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[20:23], v[20:23], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s60 :: v_dual_mov_b32 v71, s61
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s62 :: v_dual_mov_b32 v73, s63
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x9
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[16:19], v[16:19], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s12 :: v_dual_mov_b32 v67, s13
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s14 :: v_dual_mov_b32 v69, s15
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x8
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[12:15], v[12:15], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s9
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s10 :: v_dual_mov_b32 v73, s11
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[8:11], v[8:11], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s20 :: v_dual_mov_b32 v67, s21
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s22 :: v_dual_mov_b32 v69, s23
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x6
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[4:7], v[4:7], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s36 :: v_dual_mov_b32 v71, s37
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s38 :: v_dual_mov_b32 v73, s39
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x5
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s48 :: v_dual_mov_b32 v67, s49
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s50 :: v_dual_mov_b32 v69, s51
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x4
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[46:49], v[46:49], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s44 :: v_dual_mov_b32 v71, s45
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s46 :: v_dual_mov_b32 v73, s47
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x3
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[50:53], v[50:53], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s40 :: v_dual_mov_b32 v67, s41
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s42 :: v_dual_mov_b32 v69, s43
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x2
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[54:57], v[54:57], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s16 :: v_dual_mov_b32 v71, s17
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s18 :: v_dual_mov_b32 v73, s19
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[58:61], v[58:61], v[66:69]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[62:65], v[62:65], v[70:73]
; GFX1251-SDAG-NEXT: s_clause 0xf
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[54:57], s[0:1] offset:96
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[50:53], s[0:1] offset:112
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[46:49], s[0:1] offset:64
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[58:61], s[0:1] offset:80
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[62:65], s[0:1] offset:32
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[0:3], s[0:1] offset:48
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[4:7], s[0:1]
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[8:11], s[0:1] offset:16
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[12:15], s[0:1] offset:224
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[16:19], s[0:1] offset:240
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[20:23], s[0:1] offset:192
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[24:27], s[0:1] offset:208
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[28:31], s[0:1] offset:160
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[32:35], s[0:1] offset:176
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[36:39], s[0:1] offset:128
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[40:43], s[0:1] offset:144
; GFX1251-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: add_v32_vs:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_clause 0x1
; GFX1251-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0x124 nv
; GFX1251-GISEL-NEXT: s_load_b512 s[44:59], s[4:5], 0x164 nv
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-GISEL-NEXT: v_lshlrev_b32_e32 v72, 8, v0
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: s_clause 0xf
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v72, s[0:1]
; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v72, s[0:1] offset:16
; GFX1251-GISEL-NEXT: global_load_b128 v[8:11], v72, s[0:1] offset:32
; GFX1251-GISEL-NEXT: global_load_b128 v[12:15], v72, s[0:1] offset:48
; GFX1251-GISEL-NEXT: global_load_b128 v[16:19], v72, s[0:1] offset:64
; GFX1251-GISEL-NEXT: global_load_b128 v[20:23], v72, s[0:1] offset:80
; GFX1251-GISEL-NEXT: global_load_b128 v[24:27], v72, s[0:1] offset:96
; GFX1251-GISEL-NEXT: global_load_b128 v[28:31], v72, s[0:1] offset:112
; GFX1251-GISEL-NEXT: global_load_b128 v[32:35], v72, s[0:1] offset:128
; GFX1251-GISEL-NEXT: global_load_b128 v[36:39], v72, s[0:1] offset:144
; GFX1251-GISEL-NEXT: global_load_b128 v[40:43], v72, s[0:1] offset:160
; GFX1251-GISEL-NEXT: global_load_b128 v[44:47], v72, s[0:1] offset:176
; GFX1251-GISEL-NEXT: global_load_b128 v[48:51], v72, s[0:1] offset:192
; GFX1251-GISEL-NEXT: global_load_b128 v[52:55], v72, s[0:1] offset:208
; GFX1251-GISEL-NEXT: global_load_b128 v[56:59], v72, s[0:1] offset:224
; GFX1251-GISEL-NEXT: global_load_b128 v[60:63], v72, s[0:1] offset:240
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[18:19]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[22:23]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[16:17]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[20:21]
; GFX1251-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0x1a4 nv
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xf
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[26:27]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xe
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[4:7], v[4:7], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[30:31]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[24:25]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[28:29]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xd
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[8:11], v[8:11], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[46:47]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xc
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[12:15], v[12:15], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[50:51]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[44:45]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[48:49]
; GFX1251-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0x1e4 nv
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xb
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[16:19], v[16:19], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[54:55]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xa
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[20:23], v[20:23], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[58:59]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[52:53]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[56:57]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x9
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[24:27], v[24:27], v[64:67]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[10:11]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x8
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[28:31], v[28:31], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[14:15]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[8:9]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[12:13]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[32:35], v[32:35], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[18:19]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x6
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[36:39], v[36:39], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[22:23]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[16:17]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[20:21]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x5
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[40:43], v[40:43], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[38:39]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x4
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[44:47], v[44:47], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[42:43]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[36:37]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[40:41]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x3
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[48:51], v[48:51], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[46:47]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x2
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[52:55], v[52:55], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[50:51]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[44:45]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[48:49]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x1
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[56:59], v[56:59], v[64:67]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[60:63], v[60:63], v[68:71]
; GFX1251-GISEL-NEXT: s_clause 0xf
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[0:3], s[0:1]
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[4:7], s[0:1] offset:16
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[8:11], s[0:1] offset:32
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[12:15], s[0:1] offset:48
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[16:19], s[0:1] offset:64
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[20:23], s[0:1] offset:80
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[24:27], s[0:1] offset:96
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[28:31], s[0:1] offset:112
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[32:35], s[0:1] offset:128
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[36:39], s[0:1] offset:144
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[40:43], s[0:1] offset:160
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[44:47], s[0:1] offset:176
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[48:51], s[0:1] offset:192
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[52:55], s[0:1] offset:208
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[56:59], s[0:1] offset:224
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[60:63], s[0:1] offset:240
; GFX1251-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <32 x i64>, ptr addrspace(1) %gep, align 128
%add = add <32 x i64> %load, %x
store <32 x i64> %add, ptr addrspace(1) %gep, align 128
ret void
}
; <2 x i64> add of a value loaded from memory and the constant splat
; <100, 100>. The current checks show the constant (0x64) being materialized
; into v[4:7] and passed to v_pk_add_nc_u64 as a register operand rather than
; as an immediate. SelectionDAG builds it with v_dual_mov_b32; GlobalISel
; builds it in SGPRs with s_mov_b64 and then copies it with v_mov_b64.
define amdgpu_kernel void @add_v2_v_imm(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: add_v2_v_imm:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0x64 :: v_dual_mov_b32 v5, 0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: add_v2_v_imm:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 0x64
; GFX1251-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%add = add <2 x i64> %load, <i64 100, i64 100>
store <2 x i64> %add, ptr addrspace(1) %gep, align 8
ret void
}
; <2 x i64> add of a value loaded from memory and a splat of a non-constant
; scalar: the zero-extended workitem.id.x is inserted into both lanes of %k.
; The checks expect the splat to be assembled in v[0:3] (id in the low dword
; of each lane, 0 in the high dword) and used as the second source of a single
; v_pk_add_nc_u64.
define amdgpu_kernel void @add_v2_v_v_splat(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: add_v2_v_v_splat:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, v0
; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[2:5], v[4:7], v[0:3]
; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: add_v2_v_v_splat:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[2:5], v[4:7], v[0:3]
; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%id.1 = zext i32 %id to i64
; Source and destination types are both i64, so this cast leaves the value
; unchanged. NOTE(review): possibly carried over from a test using a
; different element type - confirm before removing.
%fid = bitcast i64 %id.1 to i64
; Build the splat by inserting the same scalar into lane 0 and lane 1.
%tmp1 = insertelement <2 x i64> poison, i64 %fid, i64 0
%k = insertelement <2 x i64> %tmp1, i64 %fid, i64 1
%add = add <2 x i64> %load, %k
store <2 x i64> %add, ptr addrspace(1) %gep, align 8
ret void
}
; TODO: splat literal can be folded, but it is a REG_SEQUENCE which we do not match
; <2 x i64> add of a value loaded from memory and the constant splat <1, 1>.
; The current checks record the unfolded form: the constant is materialized
; into v[4:7] (v_dual_mov_b32 for SelectionDAG; s_mov_b64 followed by
; v_mov_b64 for GlobalISel) and passed to v_pk_add_nc_u64 as a register.
define amdgpu_kernel void @add_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: add_v2_v_lit_splat:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: add_v2_v_lit_splat:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 1
; GFX1251-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%add = add <2 x i64> %load, <i64 1, i64 1>
store <2 x i64> %add, ptr addrspace(1) %gep, align 8
ret void
}
; <2 x i64> add of a value loaded from memory and the non-splat constant
; <1, 0> (element 0 is 1, element 1 is 0). The checks expect the constant to
; be materialized into v[4:7] and passed to v_pk_add_nc_u64 as a register.
define amdgpu_kernel void @add_v2_v_lit_hi0(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: add_v2_v_lit_hi0:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 1
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v7, v5
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: add_v2_v_lit_hi0:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 0
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 1
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%add = add <2 x i64> %load, <i64 1, i64 0>
store <2 x i64> %add, ptr addrspace(1) %gep, align 8
ret void
}
; <2 x i64> add of a value loaded from memory and the non-splat constant
; <0, 1> (element 0 is 0, element 1 is 1). The checks expect the constant to
; be materialized into v[4:7] and passed to v_pk_add_nc_u64 as a register.
define amdgpu_kernel void @add_v2_v_lit_lo0(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: add_v2_v_lit_lo0:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v6, 1
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v7, v4
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: add_v2_v_lit_lo0:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 1
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%add = add <2 x i64> %load, <i64 0, i64 1>
store <2 x i64> %add, ptr addrspace(1) %gep, align 8
ret void
}
; <2 x i64> add of a value loaded from memory and the constant <1, 2>, whose
; two elements differ and are both non-zero. As the function name indicates,
; this constant is not expected to fold into the instruction: the checks
; require it to be materialized into v[4:7] and passed to v_pk_add_nc_u64 as a
; register operand.
define amdgpu_kernel void @add_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: add_v2_v_unfoldable_lit:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 1
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, v5
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: add_v2_v_unfoldable_lit:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 2
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 1
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_add_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%add = add <2 x i64> %load, <i64 1, i64 2>
store <2 x i64> %add, ptr addrspace(1) %gep, align 8
ret void
}
; Subtracts two <2 x i64> vectors that are both loaded from memory (%a and %b,
; each indexed by the workitem id) and stores the difference back to %a.
; Both instruction selectors share one set of checks here: a single
; v_pk_sub_nc_u64 with both sources coming straight from the global loads.
define amdgpu_kernel void @sub_v2_vv(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; GFX1251-LABEL: sub_v2_vv:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv
; GFX1251-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: s_clause 0x1
; GFX1251-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-NEXT: global_load_b128 v[4:7], v8, s[2:3] scale_offset
; GFX1251-NEXT: s_wait_loadcnt 0x0
; GFX1251-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load.a = load <2 x i64>, ptr addrspace(1) %gep.a, align 8
%gep.b = getelementptr inbounds <2 x i64>, ptr addrspace(1) %b, i32 %id
%load.b = load <2 x i64>, ptr addrspace(1) %gep.b, align 8
%sub = sub <2 x i64> %load.a, %load.b
store <2 x i64> %sub, ptr addrspace(1) %gep.a, align 8
ret void
}
; Subtracts the kernel argument %x from a <2 x i64> loaded from %a at the
; workitem-id index and stores the result back in place.
; In the checked output %x arrives in s[0:3] via s_load_b128 and is copied to
; v[4:7] with two v_mov_b64 before being used as the second source of
; v_pk_sub_nc_u64. Both instruction selectors share these checks.
define amdgpu_kernel void @sub_v2_vs(ptr addrspace(1) %a, <2 x i64> %x) {
; GFX1251-LABEL: sub_v2_vs:
; GFX1251: ; %bb.0:
; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-NEXT: s_clause 0x1
; GFX1251-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
; GFX1251-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-NEXT: s_wait_kmcnt 0x0
; GFX1251-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-NEXT: s_wait_loadcnt 0x0
; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%sub = sub <2 x i64> %load, %x
store <2 x i64> %sub, ptr addrspace(1) %gep, align 8
ret void
}
; Subtracts two <2 x i64> kernel arguments (%x - %y) and stores the result to
; %a directly; no workitem-id indexing is involved.
; In the checked output both operands are fetched with one s_load_b256 and
; copied into VGPRs (v[0:3] and v[4:7]) before v_pk_sub_nc_u64. The two
; selectors differ in how the copies are emitted (v_dual_mov_b32 pairs versus
; v_mov_b64) and in where the zero store offset register is set up.
define amdgpu_kernel void @sub_v2_ss(ptr addrspace(1) %a, <2 x i64> %x, <2 x i64> %y) {
; GFX1251-SDAG-LABEL: sub_v2_ss:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s8
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v4, s12
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v6, s14
; GFX1251-SDAG-NEXT: v_mov_b32_e32 v7, s15
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: sub_v2_ss:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_clause 0x1
; GFX1251-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv
; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX1251-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX1251-GISEL-NEXT: s_endpgm
%sub = sub <2 x i64> %x, %y
store <2 x i64> %sub, ptr addrspace(1) %a, align 8
ret void
}
; Subtracts the <4 x i64> kernel argument %x from a <4 x i64> loaded from %a at
; the workitem-id index and stores the result back in place.
; The checked output handles the vector as two 128-bit halves: two
; global_load_b128, two v_pk_sub_nc_u64 (one per pair of i64 elements) and two
; global_store_b128. The byte offset is computed explicitly with a shift by 5
; rather than with the scale_offset addressing seen in the <2 x i64> tests.
define amdgpu_kernel void @sub_v4_vs(ptr addrspace(1) %a, <4 x i64> %x) {
; GFX1251-SDAG-LABEL: sub_v4_vs:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 nv
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_lshlrev_b32_e32 v16, 5, v0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v16, s[0:1]
; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v12, s8
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v13, s9 :: v_dual_mov_b32 v14, s10
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v11, s15
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[12:15]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[4:7], v[4:7], v[8:11]
; GFX1251-SDAG-NEXT: s_clause 0x1
; GFX1251-SDAG-NEXT: global_store_b128 v16, v[0:3], s[0:1]
; GFX1251-SDAG-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: sub_v4_vs:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 nv
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-GISEL-NEXT: v_lshlrev_b32_e32 v16, 5, v0
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: s_clause 0x1
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v16, s[0:1]
; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x1
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[8:11]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[4:7], v[4:7], v[12:15]
; GFX1251-GISEL-NEXT: s_clause 0x1
; GFX1251-GISEL-NEXT: global_store_b128 v16, v[0:3], s[0:1]
; GFX1251-GISEL-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <4 x i64>, ptr addrspace(1) %gep, align 16
%sub = sub <4 x i64> %load, %x
store <4 x i64> %sub, ptr addrspace(1) %gep, align 16
ret void
}
; Wide-vector variant: subtracts the <32 x i64> kernel argument %x from a
; <32 x i64> loaded from %a at the workitem-id index and stores it back.
; The checked output splits the operation into sixteen 128-bit pieces: sixteen
; global_load_b128, sixteen v_pk_sub_nc_u64 and sixteen global_store_b128.
; %x is fetched with four s_load_b512 and copied piecewise into a small set of
; VGPRs (v[66:73] for SelectionDAG, v[64:71] for GlobalISel) that is reused for
; each subtraction. The two selectors order the loads, copies and stores
; differently, so each has its own check block.
define amdgpu_kernel void @sub_v32_vs(ptr addrspace(1) %a, <32 x i64> %x) {
; GFX1251-SDAG-LABEL: sub_v32_vs:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-SDAG-NEXT: s_clause 0x3
; GFX1251-SDAG-NEXT: s_load_b512 s[68:83], s[4:5], 0x1a4 nv
; GFX1251-SDAG-NEXT: s_load_b512 s[52:67], s[4:5], 0x1e4 nv
; GFX1251-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0x124 nv
; GFX1251-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0x164 nv
; GFX1251-SDAG-NEXT: v_lshlrev_b32_e32 v44, 8, v0
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: s_clause 0xf
; GFX1251-SDAG-NEXT: global_load_b128 v[40:43], v44, s[0:1] offset:144
; GFX1251-SDAG-NEXT: global_load_b128 v[36:39], v44, s[0:1] offset:128
; GFX1251-SDAG-NEXT: global_load_b128 v[32:35], v44, s[0:1] offset:176
; GFX1251-SDAG-NEXT: global_load_b128 v[28:31], v44, s[0:1] offset:160
; GFX1251-SDAG-NEXT: global_load_b128 v[24:27], v44, s[0:1] offset:208
; GFX1251-SDAG-NEXT: global_load_b128 v[20:23], v44, s[0:1] offset:192
; GFX1251-SDAG-NEXT: global_load_b128 v[16:19], v44, s[0:1] offset:240
; GFX1251-SDAG-NEXT: global_load_b128 v[12:15], v44, s[0:1] offset:224
; GFX1251-SDAG-NEXT: global_load_b128 v[8:11], v44, s[0:1] offset:16
; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v44, s[0:1]
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v44, s[0:1] offset:48
; GFX1251-SDAG-NEXT: global_load_b128 v[46:49], v44, s[0:1] offset:64
; GFX1251-SDAG-NEXT: global_load_b128 v[50:53], v44, s[0:1] offset:112
; GFX1251-SDAG-NEXT: global_load_b128 v[54:57], v44, s[0:1] offset:96
; GFX1251-SDAG-NEXT: global_load_b128 v[58:61], v44, s[0:1] offset:80
; GFX1251-SDAG-NEXT: global_load_b128 v[62:65], v44, s[0:1] offset:32
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s72 :: v_dual_mov_b32 v67, s73
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s74 :: v_dual_mov_b32 v69, s75
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s68 :: v_dual_mov_b32 v71, s69
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s70 :: v_dual_mov_b32 v73, s71
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xf
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[40:43], v[40:43], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s80 :: v_dual_mov_b32 v67, s81
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s82 :: v_dual_mov_b32 v69, s83
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xe
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[36:39], v[36:39], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s76 :: v_dual_mov_b32 v71, s77
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s78 :: v_dual_mov_b32 v73, s79
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xd
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[32:35], v[32:35], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s56 :: v_dual_mov_b32 v67, s57
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s58 :: v_dual_mov_b32 v69, s59
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xc
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[28:31], v[28:31], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s52 :: v_dual_mov_b32 v71, s53
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s54 :: v_dual_mov_b32 v73, s55
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xb
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[24:27], v[24:27], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s64 :: v_dual_mov_b32 v67, s65
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s66 :: v_dual_mov_b32 v69, s67
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xa
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[20:23], v[20:23], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s60 :: v_dual_mov_b32 v71, s61
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s62 :: v_dual_mov_b32 v73, s63
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x9
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[16:19], v[16:19], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s12 :: v_dual_mov_b32 v67, s13
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s14 :: v_dual_mov_b32 v69, s15
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x8
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[12:15], v[12:15], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s9
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s10 :: v_dual_mov_b32 v73, s11
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x7
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[8:11], v[8:11], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s20 :: v_dual_mov_b32 v67, s21
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s22 :: v_dual_mov_b32 v69, s23
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x6
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[4:7], v[4:7], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s36 :: v_dual_mov_b32 v71, s37
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s38 :: v_dual_mov_b32 v73, s39
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x5
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s48 :: v_dual_mov_b32 v67, s49
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s50 :: v_dual_mov_b32 v69, s51
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x4
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[46:49], v[46:49], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s44 :: v_dual_mov_b32 v71, s45
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s46 :: v_dual_mov_b32 v73, s47
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x3
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[50:53], v[50:53], v[66:69]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s40 :: v_dual_mov_b32 v67, s41
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s42 :: v_dual_mov_b32 v69, s43
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x2
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[54:57], v[54:57], v[70:73]
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s16 :: v_dual_mov_b32 v71, s17
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s18 :: v_dual_mov_b32 v73, s19
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[58:61], v[58:61], v[66:69]
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[62:65], v[62:65], v[70:73]
; GFX1251-SDAG-NEXT: s_clause 0xf
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[54:57], s[0:1] offset:96
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[50:53], s[0:1] offset:112
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[46:49], s[0:1] offset:64
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[58:61], s[0:1] offset:80
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[62:65], s[0:1] offset:32
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[0:3], s[0:1] offset:48
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[4:7], s[0:1]
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[8:11], s[0:1] offset:16
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[12:15], s[0:1] offset:224
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[16:19], s[0:1] offset:240
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[20:23], s[0:1] offset:192
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[24:27], s[0:1] offset:208
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[28:31], s[0:1] offset:160
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[32:35], s[0:1] offset:176
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[36:39], s[0:1] offset:128
; GFX1251-SDAG-NEXT: global_store_b128 v44, v[40:43], s[0:1] offset:144
; GFX1251-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: sub_v32_vs:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_clause 0x1
; GFX1251-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0x124 nv
; GFX1251-GISEL-NEXT: s_load_b512 s[44:59], s[4:5], 0x164 nv
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-GISEL-NEXT: v_lshlrev_b32_e32 v72, 8, v0
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: s_clause 0xf
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v72, s[0:1]
; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v72, s[0:1] offset:16
; GFX1251-GISEL-NEXT: global_load_b128 v[8:11], v72, s[0:1] offset:32
; GFX1251-GISEL-NEXT: global_load_b128 v[12:15], v72, s[0:1] offset:48
; GFX1251-GISEL-NEXT: global_load_b128 v[16:19], v72, s[0:1] offset:64
; GFX1251-GISEL-NEXT: global_load_b128 v[20:23], v72, s[0:1] offset:80
; GFX1251-GISEL-NEXT: global_load_b128 v[24:27], v72, s[0:1] offset:96
; GFX1251-GISEL-NEXT: global_load_b128 v[28:31], v72, s[0:1] offset:112
; GFX1251-GISEL-NEXT: global_load_b128 v[32:35], v72, s[0:1] offset:128
; GFX1251-GISEL-NEXT: global_load_b128 v[36:39], v72, s[0:1] offset:144
; GFX1251-GISEL-NEXT: global_load_b128 v[40:43], v72, s[0:1] offset:160
; GFX1251-GISEL-NEXT: global_load_b128 v[44:47], v72, s[0:1] offset:176
; GFX1251-GISEL-NEXT: global_load_b128 v[48:51], v72, s[0:1] offset:192
; GFX1251-GISEL-NEXT: global_load_b128 v[52:55], v72, s[0:1] offset:208
; GFX1251-GISEL-NEXT: global_load_b128 v[56:59], v72, s[0:1] offset:224
; GFX1251-GISEL-NEXT: global_load_b128 v[60:63], v72, s[0:1] offset:240
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[18:19]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[22:23]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[16:17]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[20:21]
; GFX1251-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0x1a4 nv
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xf
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[26:27]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xe
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[4:7], v[4:7], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[30:31]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[24:25]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[28:29]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xd
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[8:11], v[8:11], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[46:47]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xc
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[12:15], v[12:15], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[50:51]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[44:45]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[48:49]
; GFX1251-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0x1e4 nv
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xb
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[16:19], v[16:19], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[54:55]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xa
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[20:23], v[20:23], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[58:59]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[52:53]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[56:57]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x9
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[24:27], v[24:27], v[64:67]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[10:11]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x8
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[28:31], v[28:31], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[14:15]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[8:9]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[12:13]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[32:35], v[32:35], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[18:19]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x6
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[36:39], v[36:39], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[22:23]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[16:17]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[20:21]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x5
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[40:43], v[40:43], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[38:39]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x4
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[44:47], v[44:47], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[42:43]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[36:37]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[40:41]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x3
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[48:51], v[48:51], v[64:67]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[46:47]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x2
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[52:55], v[52:55], v[68:71]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[50:51]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[44:45]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[48:49]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x1
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[56:59], v[56:59], v[64:67]
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[60:63], v[60:63], v[68:71]
; GFX1251-GISEL-NEXT: s_clause 0xf
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[0:3], s[0:1]
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[4:7], s[0:1] offset:16
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[8:11], s[0:1] offset:32
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[12:15], s[0:1] offset:48
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[16:19], s[0:1] offset:64
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[20:23], s[0:1] offset:80
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[24:27], s[0:1] offset:96
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[28:31], s[0:1] offset:112
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[32:35], s[0:1] offset:128
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[36:39], s[0:1] offset:144
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[40:43], s[0:1] offset:160
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[44:47], s[0:1] offset:176
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[48:51], s[0:1] offset:192
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[52:55], s[0:1] offset:208
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[56:59], s[0:1] offset:224
; GFX1251-GISEL-NEXT: global_store_b128 v72, v[60:63], s[0:1] offset:240
; GFX1251-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <32 x i64>, ptr addrspace(1) %gep, align 128
%sub = sub <32 x i64> %load, %x
store <32 x i64> %sub, ptr addrspace(1) %gep, align 128
ret void
}
; Subtracts the splat constant <i64 100, i64 100> from a <2 x i64> loaded from
; %a at the workitem-id index and stores the result back in place
; (constant on the right-hand side of the subtraction).
; The checked output materializes 0x64 in registers, duplicates it for the
; second element, and passes v[4:7] as the second source of v_pk_sub_nc_u64.
define amdgpu_kernel void @sub_v2_v_imm(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: sub_v2_v_imm:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0x64 :: v_dual_mov_b32 v5, 0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: sub_v2_v_imm:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 0x64
; GFX1251-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%sub = sub <2 x i64> %load, <i64 100, i64 100>
store <2 x i64> %sub, ptr addrspace(1) %gep, align 8
ret void
}
; Operand-swapped counterpart of the "load minus splat 100" case: computes
; <i64 100, i64 100> minus a <2 x i64> loaded from %a at the workitem-id index
; and stores the result back in place.
; The checked output materializes the constant in v[4:7] and uses it as the
; FIRST source of v_pk_sub_nc_u64, with the loaded value as the second source.
define amdgpu_kernel void @sub_v2_imm_v(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: sub_v2_imm_v:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0x64 :: v_dual_mov_b32 v5, 0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[4:7], v[0:3]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: sub_v2_imm_v:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 0x64
; GFX1251-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[0:3], v[4:7], v[0:3]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%sub = sub <2 x i64> <i64 100, i64 100>, %load
store <2 x i64> %sub, ptr addrspace(1) %gep, align 8
ret void
}
; Subtracts a splat of a per-lane value from a <2 x i64> loaded from %a at the
; workitem-id index and stores the result back in place. The splat is built by
; zero-extending the workitem id to i64 and inserting it into both elements.
; Note that %fid is an identity cast (bitcast i64 to i64), so %fid and %id.1
; hold the same value.
; The checked output forms the splat in v[0:3] (masked id plus a zero high
; half, then copied to the second element) and uses it as the second source of
; v_pk_sub_nc_u64.
define amdgpu_kernel void @sub_v2_v_v_splat(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: sub_v2_v_v_splat:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, v0
; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[2:5], v[4:7], v[0:3]
; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: sub_v2_v_v_splat:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1251-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[2:5], v[4:7], v[0:3]
; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%id.1 = zext i32 %id to i64
%fid = bitcast i64 %id.1 to i64
%tmp1 = insertelement <2 x i64> poison, i64 %fid, i64 0
%k = insertelement <2 x i64> %tmp1, i64 %fid, i64 1
%sub = sub <2 x i64> %load, %k
store <2 x i64> %sub, ptr addrspace(1) %gep, align 8
ret void
}
; Subtracts the splat constant <i64 1, i64 1> from a <2 x i64> loaded from %a
; at the workitem-id index and stores the result back in place.
; The checks record the current, unfolded output: the constant is built in
; v[4:7] and passed to v_pk_sub_nc_u64 as a register operand.
; TODO: splat literal can be folded, but it is a REG_SEQUENCE which we do not match
define amdgpu_kernel void @sub_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: sub_v2_v_lit_splat:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 0
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: sub_v2_v_lit_splat:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 1
; GFX1251-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%sub = sub <2 x i64> %load, <i64 1, i64 1>
store <2 x i64> %sub, ptr addrspace(1) %gep, align 8
ret void
}
; Subtracts the constant vector <i64 1, i64 0> (element 1 is zero) from a
; <2 x i64> loaded from %a at the workitem-id index and stores the result back
; in place.
; The checked output for both instruction selectors materializes the constant
; in v[4:7] and passes it to v_pk_sub_nc_u64 as a register operand.
define amdgpu_kernel void @sub_v2_v_lit_hi0(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: sub_v2_v_lit_hi0:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 1
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v7, v5
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: sub_v2_v_lit_hi0:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 0
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 1
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%sub = sub <2 x i64> %load, <i64 1, i64 0>
store <2 x i64> %sub, ptr addrspace(1) %gep, align 8
ret void
}
; Subtracts the constant vector <i64 0, i64 1> (element 0 is zero, element 1 is
; one) from a <2 x i64> value. The value is loaded from %a at the index given
; by the workitem id in x, and the result is stored back to the same address.
; The expected output for both instruction selectors is a single
; v_pk_sub_nc_u64 whose constant operand is first materialized in v[4:7], with
; the zero in v[4:5] and the one in v[6:7].
; The assertion lines below are autogenerated; regenerate them instead of
; editing by hand.
define amdgpu_kernel void @sub_v2_v_lit_lo0(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: sub_v2_v_lit_lo0:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v6, 1
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v7, v4
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: sub_v2_v_lit_lo0:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 1
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%sub = sub <2 x i64> %load, <i64 0, i64 1>
store <2 x i64> %sub, ptr addrspace(1) %gep, align 8
ret void
}
; Subtracts the constant vector <i64 1, i64 2> (two different non-zero
; elements) from a <2 x i64> value. The value is loaded from %a at the index
; given by the workitem id in x, and the result is stored back to the same
; address.
; The expected output for both instruction selectors keeps the constant in
; registers: it is materialized in v[4:7] and passed as a register operand to
; v_pk_sub_nc_u64.
; NOTE(review): the name suggests this literal is one the packed instruction
; cannot take as an immediate operand; confirm against the operand folding
; rules for this target.
; The assertion lines below are autogenerated; regenerate them instead of
; editing by hand.
define amdgpu_kernel void @sub_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX1251-SDAG-LABEL: sub_v2_v_unfoldable_lit:
; GFX1251-SDAG: ; %bb.0:
; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv
; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 1
; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, v5
; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1251-SDAG-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset
; GFX1251-SDAG-NEXT: s_endpgm
;
; GFX1251-GISEL-LABEL: sub_v2_v_unfoldable_lit:
; GFX1251-GISEL: ; %bb.0:
; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv
; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 2
; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 1
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1251-GISEL-NEXT: v_pk_sub_nc_u64 v[0:3], v[0:3], v[4:7]
; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1251-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %a, i32 %id
%load = load <2 x i64>, ptr addrspace(1) %gep, align 8
%sub = sub <2 x i64> %load, <i64 1, i64 2>
store <2 x i64> %sub, ptr addrspace(1) %gep, align 8
ret void
}