| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s |
| |
| ; Scalar i32 case. Passes the kernel arguments %in1 and %in2 directly to |
| ; llvm.amdgcn.update.dpp.i32 and stores the result to %out. Every run is |
| ; expected to emit a single v_mov_b32_dpp whose modifiers mirror the three |
| ; i32 immediates of the call (quad_perm [1,0,0,0], row_mask 0x1, bank_mask 0x1). |
| define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { |
| ; GFX8-OPT-LABEL: dpp_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| ; The final i1 operand is false, and none of the expected DPP movs above |
| ; carries a bound_ctrl modifier. |
| %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0 |
| store i32 %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Scalar i32 case with the final i1 operand of the intrinsic set to true and |
| ; 2 as the first immediate. Passes %in1 and %in2 to |
| ; llvm.amdgcn.update.dpp.i32 and stores the result to %out. Every run is |
| ; expected to emit one v_mov_b32_dpp with quad_perm [2,0,0,0], both masks 0x1 |
| ; and the bound_ctrl modifier set to 1. |
| define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_bc: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_bc: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_bc: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_bc: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| ; First immediate 2 is checked as quad_perm [2,0,0,0]; the trailing i1 true is |
| ; checked as the bound_ctrl modifier on the DPP mov. |
| %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 true) #0 |
| store i32 %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| |
| ; Array of 448 i32 in addrspace(3), initialised to poison. dpp_test1 below |
| ; loads one element from it per workitem. |
| @0 = internal unnamed_addr addrspace(3) global [448 x i32] poison, align 4 |
| ; Loads one i32 from @0 at the workitem id, passes an s.barrier bracketed by |
| ; workgroup-one-as release and acquire fences, doubles the loaded value and |
| ; runs it through llvm.amdgcn.update.dpp.i32 with a constant 0 as the first |
| ; operand. The DPP result is added to the doubled value and stored through |
| ; %arg at the same index. The call uses control value 177 and both masks 15, |
| ; which every run checks as quad_perm [1,0,3,2] with row_mask and bank_mask |
| ; 0xf. Both GFX8 runs additionally expect an s_nop 1 between the add that |
| ; produces the DPP source and the DPP mov itself. |
| define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { |
| ; GFX8-OPT-LABEL: dpp_test1: |
| ; GFX8-OPT: ; %bb.0: ; %bb |
| ; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX8-OPT-NEXT: s_mov_b32 m0, -1 |
| ; GFX8-OPT-NEXT: ds_read_b32 v1, v0 |
| ; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: s_barrier |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v1, vcc, v1, v1 |
| ; GFX8-OPT-NEXT: s_nop 1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v1 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-OPT-NEXT: flat_store_dword v[0:1], v2 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test1: |
| ; GFX8-NOOPT: ; %bb.0: ; %bb |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 2 |
| ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v3, s0, v0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 m0, -1 |
| ; GFX8-NOOPT-NEXT: ds_read_b32 v0, v3 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_barrier |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v1, s[0:1], v0, v0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v2, s[0:1], v0, v1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, v0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX8-NOOPT-NEXT: v_addc_u32_e64 v3, s[0:1], v1, v3, s[0:1] |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v3 |
| ; GFX8-NOOPT-NEXT: flat_store_dword v[0:1], v2 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test1: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: ds_read_b32 v1, v0 |
| ; GFX10-NEXT: s_barrier |
| ; GFX10-NEXT: buffer_gl0_inv |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf |
| ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v1 |
| ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 |
| ; GFX10-NEXT: flat_store_dword v[0:1], v2 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test1: |
| ; GFX11: ; %bb.0: ; %bb |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 |
| ; GFX11-NEXT: ds_load_b32 v1, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_barrier |
| ; GFX11-NEXT: buffer_gl0_inv |
| ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 |
| ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf |
| ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1 |
| ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 |
| ; GFX11-NEXT: flat_store_b32 v[0:1], v2 |
| ; GFX11-NEXT: s_endpgm |
| bb: |
| ; The workitem id indexes both the load from @0 and, zero-extended to i64, |
| ; the final store through %arg. |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds [448 x i32], ptr addrspace(3) @0, i32 0, i32 %tmp |
| %tmp3 = load i32, ptr addrspace(3) %tmp2, align 4 |
| ; Release fence, barrier, acquire fence sit between the load and its first use. |
| fence syncscope("workgroup-one-as") release |
| tail call void @llvm.amdgcn.s.barrier() |
| fence syncscope("workgroup-one-as") acquire |
| ; %tmp4 is both the DPP source and the second addend of the final sum. |
| %tmp4 = add nsw i32 %tmp3, %tmp3 |
| %tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false) |
| %tmp6 = add nsw i32 %tmp5, %tmp4 |
| %tmp7 = getelementptr inbounds i32, ptr %arg, i64 %tmp1 |
| store i32 %tmp6, ptr %tmp7, align 4 |
| ret void |
| } |
| |
| ; i64 case. Loads an i64 from %arg at the workitem id, passes %in1 and the |
| ; loaded value to llvm.amdgcn.update.dpp.i64, and stores the result back to |
| ; the same element. Every run expects two v_mov_b32_dpp instructions, one per |
| ; 32-bit register of the 64-bit pair, whose destinations are first seeded |
| ; from %in1 and whose sources are the loaded registers. %in2 is declared but |
| ; not referenced in the body. |
| define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) { |
| ; GFX8-OPT-LABEL: update_dppi64_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v5, s3 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dppi64_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 |
| ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s2, v0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3] |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 |
| ; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 |
| ; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dppi64_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dppi64_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; The load and the store share the address %gep, so the element is updated |
| ; in place. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id |
| %load = load i64, ptr addrspace(1) %gep |
| %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 false) #0 |
| store i64 %tmp0, ptr addrspace(1) %gep |
| ret void |
| } |
| |
| ; double case. Loads a double from %arg at the workitem id, passes %in1 and |
| ; the loaded value to llvm.amdgcn.update.dpp.f64, and stores the result back |
| ; to the same element. Every run expects two v_mov_b32_dpp instructions, one |
| ; per 32-bit register of the 64-bit pair, whose destinations are first seeded |
| ; from %in1 and whose sources are the loaded registers. %in2 is declared but |
| ; not referenced in the body. |
| define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) { |
| ; GFX8-OPT-LABEL: update_dppf64_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v5, s3 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dppf64_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 |
| ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s2, v0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3] |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 |
| ; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 |
| ; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dppf64_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dppf64_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; The load and the store share the address %gep, so the element is updated |
| ; in place. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id |
| %load = load double, ptr addrspace(1) %gep |
| %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double %load, i32 1, i32 1, i32 1, i1 false) #0 |
| store double %tmp0, ptr addrspace(1) %gep |
| ret void |
| } |
| |
| ; <2 x i32> case. Loads a two-element i32 vector from %arg at the workitem |
| ; id, passes %in1 and the loaded vector to llvm.amdgcn.update.dpp.v2i32, and |
| ; stores the result back to the same element. Every run expects two |
| ; v_mov_b32_dpp instructions, one per 32-bit register of the pair, whose |
| ; destinations are first seeded from %in1 and whose sources are the loaded |
| ; registers. %in2 is declared but not referenced in the body. |
| define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) { |
| ; GFX8-OPT-LABEL: update_dppv2i32_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v5, s3 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dppv2i32_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 |
| ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s2, v0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3] |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 |
| ; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 |
| ; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dppv2i32_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dppv2i32_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; The load and the store share the address %gep, so the element is updated |
| ; in place. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id |
| %load = load <2 x i32>, ptr addrspace(1) %gep |
| %tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #0 |
| store <2 x i32> %tmp0, ptr addrspace(1) %gep |
| ret void |
| } |
| |
| ; <2 x float> case. Loads a two-element float vector from %arg at the |
| ; workitem id, passes %in1 and the loaded vector to |
| ; llvm.amdgcn.update.dpp.v2f32, and stores the result back to the same |
| ; element. Every run expects two v_mov_b32_dpp instructions, one per 32-bit |
| ; register of the pair, whose destinations are first seeded from %in1 and |
| ; whose sources are the loaded registers. %in2 is declared but not referenced |
| ; in the body. |
| define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) { |
| ; GFX8-OPT-LABEL: update_dppv2f32_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v5, s3 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dppv2f32_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 |
| ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s2, v0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3] |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 |
| ; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 |
| ; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dppv2f32_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dppv2f32_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; The load and the store share the address %gep, so the element is updated |
| ; in place. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id |
| %load = load <2 x float>, ptr addrspace(1) %gep |
| %tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #0 |
| store <2 x float> %tmp0, ptr addrspace(1) %gep |
| ret void |
| } |
| |
| ; Exercises llvm.amdgcn.update.dpp on a flat pointer (ptr, default address |
| ; space). The value is loaded and stored as a 64-bit quantity, and every |
| ; configuration checked below expects it to be handled as two 32-bit |
| ; v_mov_b32_dpp moves, one per half. %in2 is not referenced by the body. |
| define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) { |
| ; GFX8-OPT-LABEL: update_dpp_p0_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v5, s3 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dpp_p0_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 |
| ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s2, v0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3] |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 |
| ; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 |
| ; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dpp_p0_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dpp_p0_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] |
| ; GFX11-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; Each work-item loads the pointer stored at %arg[workitem.id.x], passes it as |
| ; the second operand of update.dpp (first operand is the %in1 argument), and |
| ; writes the result back to the same slot. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id |
| %load = load ptr, ptr addrspace(1) %gep |
| %tmp0 = call ptr @llvm.amdgcn.update.dpp.p0(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #0 |
| store ptr %tmp0, ptr addrspace(1) %gep |
| ret void |
| } |
| |
| ; Exercises llvm.amdgcn.update.dpp on an addrspace(3) pointer value. The value |
| ; is read and written with 32-bit DS instructions in the expected output, and a |
| ; single v_mov_b32_dpp is expected on every configuration checked below. |
| ; %in2 is not referenced by the body. |
| define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) { |
| ; GFX8-OPT-LABEL: update_dpp_p3_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX8-OPT-NEXT: s_mov_b32 m0, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-OPT-NEXT: ds_read_b32 v1, v0 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: s_nop 0 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: ds_write_b32 v0, v2 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dpp_p3_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_load_dword s1, s[4:5], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 2 |
| ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v0, s2, v0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[2:3], s1, v0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 m0, -1 |
| ; GFX8-NOOPT-NEXT: ds_read_b32 v2, v0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_nop 0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 m0, -1 |
| ; GFX8-NOOPT-NEXT: ds_write_b32 v0, v1 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dpp_p3_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX10-NEXT: ds_read_b32 v1, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: ds_write_b32 v0, v2 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dpp_p3_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX11-NEXT: ds_load_b32 v1, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: ds_store_b32 v0, v2 |
| ; GFX11-NEXT: s_endpgm |
| ; Each work-item loads the addrspace(3) pointer stored at %arg[workitem.id.x], |
| ; passes it as the second operand of update.dpp (first operand is %in1), and |
| ; writes the result back to the same slot. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %arg, i32 %id |
| %load = load ptr addrspace(3), ptr addrspace(3) %gep |
| %tmp0 = call ptr addrspace(3) @llvm.amdgcn.update.dpp.p3(ptr addrspace(3) %in1, ptr addrspace(3) %load, i32 1, i32 1, i32 1, i1 false) #0 |
| store ptr addrspace(3) %tmp0, ptr addrspace(3) %gep |
| ret void |
| } |
| |
| ; Exercises llvm.amdgcn.update.dpp on an addrspace(5) pointer value. The |
| ; expected output accesses memory with buffer_load/buffer_store (offen, through |
| ; a SCRATCH_RSRC descriptor) on the GFX8 and GFX10 runs and with |
| ; scratch_load/scratch_store on the GFX11 run; a single v_mov_b32_dpp is |
| ; expected in every case. %in2 is not referenced by the body. |
| define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspace(5) %in1, ptr %in2) { |
| ; GFX8-OPT-LABEL: update_dpp_p5_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 |
| ; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 |
| ; GFX8-OPT-NEXT: s_mov_b32 s90, -1 |
| ; GFX8-OPT-NEXT: s_mov_b32 s91, 0xe80000 |
| ; GFX8-OPT-NEXT: s_add_u32 s88, s88, s11 |
| ; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX8-OPT-NEXT: s_addc_u32 s89, s89, 0 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-OPT-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-OPT-NEXT: s_nop 0 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dpp_p5_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s90, -1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s91, 0xe80000 |
| ; GFX8-NOOPT-NEXT: s_add_u32 s88, s88, s11 |
| ; GFX8-NOOPT-NEXT: s_addc_u32 s89, s89, 0 |
| ; GFX8-NOOPT-NEXT: s_load_dword s1, s[4:5], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 2 |
| ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v0, s2, v0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v1, s[2:3], s1, v0 |
| ; GFX8-NOOPT-NEXT: buffer_load_dword v2, v1, s[88:91], 0 offen |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_nop 0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dpp_p5_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 |
| ; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 |
| ; GFX10-NEXT: s_mov_b32 s14, -1 |
| ; GFX10-NEXT: s_mov_b32 s15, 0x31c16000 |
| ; GFX10-NEXT: s_add_u32 s12, s12, s11 |
| ; GFX10-NEXT: s_addc_u32 s13, s13, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX10-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dpp_p5_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX11-NEXT: scratch_load_b32 v1, v0, off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: scratch_store_b32 v0, v2, off |
| ; GFX11-NEXT: s_endpgm |
| ; Each work-item loads the addrspace(5) pointer stored at %arg[workitem.id.x], |
| ; passes it as the second operand of update.dpp (first operand is %in1), and |
| ; writes the result back to the same slot. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds ptr addrspace(5), ptr addrspace(5) %arg, i32 %id |
| %load = load ptr addrspace(5), ptr addrspace(5) %gep |
| %tmp0 = call ptr addrspace(5) @llvm.amdgcn.update.dpp.p5(ptr addrspace(5) %in1, ptr addrspace(5) %load, i32 1, i32 1, i32 1, i1 false) #0 |
| store ptr addrspace(5) %tmp0, ptr addrspace(5) %gep |
| ret void |
| } |
| |
| ; Exercises llvm.amdgcn.update.dpp.i64 with an immediate as the FIRST operand |
| ; and a value loaded from memory as the second. The expected output |
| ; materializes the constant 123451234512345 as the 32-bit pieces 0x7047 (paired |
| ; with the upper loaded dword) and 0x3afaedd9 (paired with the lower one), then |
| ; issues one v_mov_b32_dpp per half. %in2 is not referenced by the body. |
| define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64 %in2) { |
| ; GFX8-OPT-LABEL: update_dppi64_imm_old_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v5, 0x7047 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v4, 0x3afaedd9 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-OPT-NEXT: s_nop 0 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dppi64_imm_old_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 |
| ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[0:1], v1, v2, s[0:1] |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 |
| ; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0x7047 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0x3afaedd9 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s2 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 |
| ; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 |
| ; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dppi64_imm_old_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7047 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3afaedd9 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dppi64_imm_old_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: v_mov_b32_e32 v3, 0x7047 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0x3afaedd9 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; Each work-item loads the i64 at %arg[workitem.id.x], passes it as the second |
| ; operand of update.dpp alongside the constant first operand, and writes the |
| ; result back to the same slot. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id |
| %load = load i64, ptr addrspace(1) %gep |
| %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 false) #0 |
| store i64 %tmp0, ptr addrspace(1) %gep |
| ret void |
| } |
| |
| ; Exercises llvm.amdgcn.update.dpp.f64 with a floating-point immediate as the |
| ; FIRST operand and a value loaded from memory as the second. The expected |
| ; output materializes the constant as the 32-bit pieces 0x405edce1 (paired with |
| ; the upper loaded dword) and 0x6b8564a (paired with the lower one), then issues |
| ; one v_mov_b32_dpp per half. %in2 is not referenced by the body. |
| define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, double %in2) { |
| ; GFX8-OPT-LABEL: update_dppf64_imm_old_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v5, 0x405edce1 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v4, 0x6b8564a |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; GFX8-OPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-OPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-OPT-NEXT: s_nop 0 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: flat_store_dwordx2 v[0:1], v[4:5] |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dppf64_imm_old_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 |
| ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec |
| ; GFX8-NOOPT-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX8-NOOPT-NEXT: v_addc_u32_e64 v2, s[0:1], v1, v2, s[0:1] |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 |
| ; GFX8-NOOPT-NEXT: flat_load_dwordx2 v[2:3], v[0:1] |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0x405edce1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0x6b8564a |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s2 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s1 |
| ; GFX8-NOOPT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v5, v3 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, s2 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v2 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4 |
| ; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3] |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dppf64_imm_old_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0x405edce1 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0x6b8564a |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dppf64_imm_old_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: v_mov_b32_e32 v3, 0x405edce1 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0x6b8564a |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; Each work-item loads the double at %arg[workitem.id.x], passes it as the |
| ; second operand of update.dpp alongside the constant first operand, and writes |
| ; the result back to the same slot. |
| ; NOTE(review): the GEP below is typed i64 although the element loaded and |
| ; stored is a double; both are 8 bytes wide so the computed address is the |
| ; same, but presumably this was carried over from the i64 variant - confirm. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id |
| %load = load double, ptr addrspace(1) %gep |
| %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double 123.4512345123450, double %load, i32 1, i32 1, i32 1, i1 false) #0 |
| store double %tmp0, ptr addrspace(1) %gep |
| ret void |
| } |
| |
| ; Exercises llvm.amdgcn.update.dpp.i64 with an immediate as the SECOND operand |
| ; and the kernel argument %in1 as the first. The expected output first moves |
| ; the constant's 32-bit pieces (0x7047 and 0x3afaedd9) into VGPRs and then uses |
| ; those registers as the source of the two v_mov_b32_dpp moves, whose |
| ; destinations are initialized from the two halves of %in1. |
| define amdgpu_kernel void @update_dppi64_imm_src_test(ptr addrspace(1) %out, i64 %in1) { |
| ; GFX8-OPT-LABEL: update_dppi64_imm_src_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, 0x7047 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0x3afaedd9 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v1, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: s_nop 1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dppi64_imm_src_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s4, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s5, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s5 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s4 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, 0x7047 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s4, 0x3afaedd9 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s5, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s5 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s9, s7 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s9 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s8 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s5, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 |
| ; GFX8-NOOPT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dppi64_imm_src_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7047 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0x3afaedd9 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dppi64_imm_src_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0x7047 |
| ; GFX11-NEXT: v_mov_b32_e32 v3, 0x3afaedd9 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| ; No memory load here: the intrinsic result is stored directly to %out. |
| %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 false) #0 |
| store i64 %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Exercises llvm.amdgcn.update.dpp.f64 with a floating-point immediate as the |
| ; SECOND operand and the kernel argument %in1 as the first. The expected output |
| ; first moves the constant's 32-bit pieces (0x405edce1 and 0x6b8564a) into VGPRs |
| ; and then uses those registers as the source of the two v_mov_b32_dpp moves, |
| ; whose destinations are initialized from the two halves of %in1. |
| define amdgpu_kernel void @update_dppf64_imm_src_test(ptr addrspace(1) %out, double %in1) { |
| ; GFX8-OPT-LABEL: update_dppf64_imm_src_test: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, 0x405edce1 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0x6b8564a |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v1, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: s_nop 1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: update_dppf64_imm_src_test: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s4, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s5, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s5 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s4 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, 0x405edce1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s4, 0x6b8564a |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s5, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s5 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s9, s7 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s9 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s8 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s5, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4 |
| ; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4 |
| ; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2 |
| ; GFX8-NOOPT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: update_dppf64_imm_src_test: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0x405edce1 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0x6b8564a |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: update_dppf64_imm_src_test: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0x405edce1 |
| ; GFX11-NEXT: v_mov_b32_e32 v3, 0x6b8564a |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| ; No memory load here: the intrinsic result is stored directly to %out. |
| %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double 123.451234512345, i32 1, i32 1, i32 1, i1 false) #0 |
| store double %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Exercises llvm.amdgcn.update.dpp.f32 with both value operands taken from |
| ; kernel arguments (%in1 first, %in2 second). Every configuration checked below |
| ; expects a single v_mov_b32_dpp whose destination was initialized from %in1 |
| ; and whose source holds %in2, followed by a 32-bit store to %out. |
| define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_f32: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_f32: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| ; No memory load here: the intrinsic result is stored directly to %out. |
| %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 1, i32 1, i1 false) |
| store float %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; float update.dpp with all four immediate operands zero (i32 0, i32 0, i32 0, i1 false). |
| ; Expected DPP move: quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0, no bound_ctrl modifier. |
| define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float %in1, float %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_f32_imm_comb1: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb1: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_f32_imm_comb1: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_f32_imm_comb1: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 0, i32 0, i32 0, i1 false) |
| store float %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; float update.dpp with immediate operands (i32 3, i32 3, i32 3, i1 false). |
| ; Expected DPP move: quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3, no bound_ctrl modifier. |
| define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float %in1, float %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_f32_imm_comb2: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb2: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_f32_imm_comb2: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_f32_imm_comb2: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 3, i32 3, i32 3, i1 false) |
| store float %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; float update.dpp with immediate operands (i32 1, i32 2, i32 3, i1 true). |
| ; Expected DPP move: quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1. |
| define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float %in1, float %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_f32_imm_comb3: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb3: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_f32_imm_comb3: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_f32_imm_comb3: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 2, i32 3, i1 true) |
| store float %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; float update.dpp with immediate operands (i32 4, i32 3, i32 2, i1 true). |
| ; Expected DPP move: quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1. |
| define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %in1, float %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_f32_imm_comb4: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb4: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_f32_imm_comb4: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_f32_imm_comb4: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 4, i32 3, i32 2, i1 true) |
| store float %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; float update.dpp with immediate operands (i32 63, i32 14, i32 13, i1 true). |
| ; Expected DPP move: quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1. |
| define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %in1, float %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_f32_imm_comb5: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb5: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_f32_imm_comb5: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_f32_imm_comb5: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 14, i32 13, i1 true) |
| store float %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; float update.dpp with immediate operands (i32 63, i32 15, i32 15, i1 true). |
| ; Expected DPP move: quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1. |
| define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %in1, float %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_f32_imm_comb6: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb6: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_f32_imm_comb6: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_f32_imm_comb6: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 15, i32 15, i1 true) |
| store float %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| |
| ; float update.dpp with immediate operands (i32 64, i32 0, i32 0, i1 true). |
| ; Expected DPP move: quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1. |
| define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %in1, float %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_f32_imm_comb7: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb7: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_f32_imm_comb7: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_f32_imm_comb7: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 0, i32 0, i1 true) |
| store float %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; float update.dpp with immediate operands (i32 31, i32 15, i32 0, i1 true). |
| ; Expected DPP move: quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1. |
| define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %in1, float %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_f32_imm_comb8: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb8: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_f32_imm_comb8: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_f32_imm_comb8: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 15, i32 0, i1 true) |
| store float %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; <2 x i16> update.dpp with immediate operands (i32 1, i32 1, i32 1, i1 false). |
| ; Expected: one v_mov_b32_dpp with quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1, then a single dword store. |
| define amdgpu_kernel void @dpp_test_v2i16(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2i16: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2i16: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2i16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2i16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 1, i32 1, i1 false) |
| store <2 x i16> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; <2 x i16> update.dpp with all four immediate operands zero (i32 0, i32 0, i32 0, i1 false). |
| ; Expected DPP move: quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0, no bound_ctrl modifier. |
| define amdgpu_kernel void @dpp_test_v2i16_imm_comb1(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb1: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb1: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2i16_imm_comb1: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2i16_imm_comb1: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 0, i32 0, i32 0, i1 false) |
| store <2 x i16> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; <2 x i16> update.dpp with immediate operands (i32 3, i32 3, i32 3, i1 false). |
| ; Expected DPP move: quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3, no bound_ctrl modifier. |
| define amdgpu_kernel void @dpp_test_v2i16_imm_comb2(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb2: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb2: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2i16_imm_comb2: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2i16_imm_comb2: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 3, i32 3, i32 3, i1 false) |
| store <2 x i16> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2i16 on %in1/%in2 with immediates (1, 2, 3) and i1 true. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1; the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2i16_imm_comb3(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb3: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb3: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2i16_imm_comb3: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2i16_imm_comb3: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 2, i32 3, i1 true) |
| store <2 x i16> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2i16 on %in1/%in2 with immediates (4, 3, 2) and i1 true. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1; the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb4: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb4: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2i16_imm_comb4: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2i16_imm_comb4: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 4, i32 3, i32 2, i1 true) |
| store <2 x i16> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2i16 on %in1/%in2 with immediates (63, 14, 13) and i1 true. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1; the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb5: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb5: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2i16_imm_comb5: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2i16_imm_comb5: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 14, i32 13, i1 true) |
| store <2 x i16> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2i16 on %in1/%in2 with immediates (63, 15, 15) and i1 true. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1; the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb6: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb6: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2i16_imm_comb6: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2i16_imm_comb6: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 15, i32 15, i1 true) |
| store <2 x i16> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2i16 on %in1/%in2 with immediates (64, 0, 0) and i1 true. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1; the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb7: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb7: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2i16_imm_comb7: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2i16_imm_comb7: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 0, i32 0, i1 true) |
| store <2 x i16> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2i16 on %in1/%in2 with immediates (31, 15, 0) and i1 true. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1; the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb8: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb8: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2i16_imm_comb8: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2i16_imm_comb8: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 15, i32 0, i1 true) |
| store <2 x i16> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2f16 on %in1/%in2 with immediates (1, 1, 1) and i1 false. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 (no bound_ctrl modifier); the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2f16(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2f16: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2f16: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2f16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2f16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 1, i32 1, i1 false) |
| store <2 x half> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2f16 on %in1/%in2 with immediates (0, 0, 0) and i1 false. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 (no bound_ctrl modifier); the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2f16_imm_comb1(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb1: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb1: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2f16_imm_comb1: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2f16_imm_comb1: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 0, i32 0, i32 0, i1 false) |
| store <2 x half> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2f16 on %in1/%in2 with immediates (3, 3, 3) and i1 false. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 (no bound_ctrl modifier); the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2f16_imm_comb2(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb2: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb2: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2f16_imm_comb2: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2f16_imm_comb2: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 3, i32 3, i32 3, i1 false) |
| store <2 x half> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2f16 on %in1/%in2 with immediates (1, 2, 3) and i1 true. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1; the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2f16_imm_comb3(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb3: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb3: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2f16_imm_comb3: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2f16_imm_comb3: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 2, i32 3, i1 true) |
| store <2 x half> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Lowering test: llvm.amdgcn.update.dpp.v2f16 on %in1/%in2 with immediates (4, 3, 2) and i1 true. |
| ; All four target configurations checked below expect exactly one v_mov_b32_dpp carrying |
| ; quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1; the result is stored to %out. |
| define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb4: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb4: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2f16_imm_comb4: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2f16_imm_comb4: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 4, i32 3, i32 2, i1 true) |
| store <2 x half> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Immediate-combination test 5 for the <2 x half> overload of update.dpp. |
| ; The call passes i32 63, i32 14, i32 13, i1 true as its trailing operands, and |
| ; every check prefix below expects a single v_mov_b32_dpp carrying |
| ; quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1. |
| ; The first immediate (63 = 0b00111111) shows up as four 2-bit lane selectors, |
| ; least-significant pair first. |
| define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb5: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb5: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2f16_imm_comb5: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2f16_imm_comb5: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 14, i32 13, i1 true) |
| store <2 x half> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Immediate-combination test 6 for the <2 x half> overload of update.dpp. |
| ; The call passes i32 63, i32 15, i32 15, i1 true as its trailing operands, and |
| ; every check prefix below expects a single v_mov_b32_dpp carrying |
| ; quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1. |
| ; Same first immediate as comb5; only the two mask immediates differ (both 15). |
| define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb6: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb6: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2f16_imm_comb6: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2f16_imm_comb6: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 15, i32 15, i1 true) |
| store <2 x half> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Immediate-combination test 7 for the <2 x half> overload of update.dpp. |
| ; The call passes i32 64, i32 0, i32 0, i1 true as its trailing operands, and |
| ; every check prefix below expects a single v_mov_b32_dpp carrying |
| ; quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1. |
| ; The first immediate (64 = 0b01000000) shows up as four 2-bit lane selectors, |
| ; least-significant pair first; both mask immediates are zero in this case. |
| define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb7: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb7: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2f16_imm_comb7: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2f16_imm_comb7: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 0, i32 0, i1 true) |
| store <2 x half> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Immediate-combination test 8 for the <2 x half> overload of update.dpp. |
| ; The call passes i32 31, i32 15, i32 0, i1 true as its trailing operands, and |
| ; every check prefix below expects a single v_mov_b32_dpp carrying |
| ; quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1. |
| ; The first immediate (31 = 0b00011111) shows up as four 2-bit lane selectors, |
| ; least-significant pair first. |
| define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) { |
| ; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb8: |
| ; GFX8-OPT: ; %bb.0: |
| ; GFX8-OPT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX8-OPT-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX8-OPT-NEXT: s_mov_b32 s6, -1 |
| ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-OPT-NEXT: s_mov_b32 s4, s0 |
| ; GFX8-OPT-NEXT: s_mov_b32 s5, s1 |
| ; GFX8-OPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-OPT-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; GFX8-OPT-NEXT: s_endpgm |
| ; |
| ; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb8: |
| ; GFX8-NOOPT: ; %bb.0: |
| ; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] |
| ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 |
| ; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c |
| ; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30 |
| ; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s6, 0xf000 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s7, -1 |
| ; GFX8-NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s1, s8 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s2, s7 |
| ; GFX8-NOOPT-NEXT: s_mov_b32 s3, s6 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NOOPT-NEXT: s_nop 1 |
| ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX8-NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX8-NOOPT-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: dpp_test_v2f16_imm_comb8: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX10-NEXT: s_mov_b32 s2, -1 |
| ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: dpp_test_v2f16_imm_comb8: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 |
| ; GFX11-NEXT: s_mov_b32 s2, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1 |
| ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 |
| ; GFX11-NEXT: s_endpgm |
| %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 15, i32 0, i1 true) |
| store <2 x half> %tmp0, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Intrinsic declarations for the tests in this file. Each update.dpp overload |
| ; takes two value operands of its own type followed by (i32, i32, i32, i1) and |
| ; carries attribute group #0. The workitem.id.x and s.barrier declarations are |
| ; not called by the v2f16 imm_comb tests directly above; presumably other tests |
| ; in the file use them -- verify before removing. |
| declare i32 @llvm.amdgcn.workitem.id.x() |
| declare void @llvm.amdgcn.s.barrier() |
| declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0 |
| declare <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16>, <2 x i16>, i32, i32, i32, i1) #0 |
| declare <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half>, <2 x half>, i32, i32, i32, i1) #0 |
| declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0 |
| declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0 |
| |
| attributes #0 = { nounwind readnone convergent } |
| ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: |
| ; GCN: {{.*}} |
| ; GCN-OPT: {{.*}} |
| ; GFX8: {{.*}} |