; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s

; Basic 32-bit case: one llvm.amdgcn.update.dpp.i32 call on two scalar kernel
; arguments, with the result stored to %out.
; The call passes control immediate 1, row mask 1, bank mask 1 and a false
; final flag. Every run configuration below expects exactly one v_mov_b32_dpp
; carrying quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 and no bound_ctrl
; modifier. The -O0 run additionally expects an s_nop 1 directly before the
; DPP move.
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
; GFX8-OPT-LABEL: dpp_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected code %in1 is copied into the register the DPP move writes
  ; and %in2 into the register it reads.
  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0
  store i32 %tmp0, ptr addrspace(1) %out
  ret void
}

; Same shape as the plain 32-bit test, but the call uses control immediate 2
; and passes true as the final flag. Every run configuration below expects one
; v_mov_b32_dpp with quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 and the
; bound_ctrl:1 modifier present.
define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
; GFX8-OPT-LABEL: dpp_test_bc:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_bc:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_bc:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_bc:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; The trailing i1 true is what distinguishes this call from the plain test;
  ; it shows up as bound_ctrl:1 on the expected DPP move.
  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 true) #0
  store i32 %tmp0, ptr addrspace(1) %out
  ret void
}


@0 = internal unnamed_addr addrspace(3) global [448 x i32] poison, align 4
; DPP fed by a value computed in a VALU instruction, around a workgroup barrier.
; Each workitem loads element [workitem.id.x] of the addrspace(3) array @0,
; executes a release fence, s.barrier and an acquire fence, doubles the loaded
; value, and passes it through update.dpp.i32 with a zero first operand,
; control immediate 177 and row/bank masks of 15. The DPP result is added to
; the doubled value and stored to %arg[workitem.id.x].
; Expected: one v_mov_b32_dpp with quad_perm:[1,0,3,2] row_mask:0xf
; bank_mask:0xf. The GFX8 runs also expect an s_nop 1 between the add that
; produces the DPP source and the DPP move itself.
; In every configuration the s_waitcnt lgkmcnt(0) covering the LDS read and the
; kernel-argument load is expected ahead of s_barrier, since the release fence
; precedes the barrier in the IR.
define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX8-OPT-LABEL: dpp_test1:
; GFX8-OPT:       ; %bb.0: ; %bb
; GFX8-OPT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-OPT-NEXT:    s_mov_b32 m0, -1
; GFX8-OPT-NEXT:    ds_read_b32 v1, v0
; GFX8-OPT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    s_barrier
; GFX8-OPT-NEXT:    v_add_u32_e32 v1, vcc, v1, v1
; GFX8-OPT-NEXT:    s_nop 1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
; GFX8-OPT-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-OPT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-OPT-NEXT:    flat_store_dword v[0:1], v2
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test1:
; GFX8-NOOPT:       ; %bb.0: ; %bb
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, 2
; GFX8-NOOPT-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
; GFX8-NOOPT-NEXT:    s_mov_b32 m0, -1
; GFX8-NOOPT-NEXT:    ds_read_b32 v0, v3
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_barrier
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v1, s[0:1], v0, v0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v2, s[0:1], v0, v1
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, 0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v4, v0
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, s2
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, v3
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s3
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NOOPT-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v3, s[0:1]
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, v3
; GFX8-NOOPT-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test1:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    ds_read_b32 v1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_barrier
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_add_co_u32 v0, s0, s0, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v1
; GFX10-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v1
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-NEXT:    flat_store_dword v[0:1], v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test1:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT:    ds_load_b32 v1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_barrier
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_add_co_u32 v0, s0, s0, v0
; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
; GFX11-NEXT:    v_add_nc_u32_e32 v2, v2, v1
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_endpgm
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds [448 x i32], ptr addrspace(3) @0, i32 0, i32 %tmp
  %tmp3 = load i32, ptr addrspace(3) %tmp2, align 4
  ; Release fence, barrier, acquire fence: the LDS load above must be complete
  ; before the barrier is reached.
  fence syncscope("workgroup-one-as") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup-one-as") acquire
  ; %tmp4 is produced by a VALU add and consumed directly by the DPP operation.
  %tmp4 = add nsw i32 %tmp3, %tmp3
  %tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)
  %tmp6 = add nsw i32 %tmp5, %tmp4
  %tmp7 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
  store i32 %tmp6, ptr %tmp7, align 4
  ret void
}

; 64-bit integer payload. Each workitem loads the i64 at %arg[workitem.id.x],
; passes %in1 as the first operand and the loaded value as the second operand
; of llvm.amdgcn.update.dpp.i64 (control 1, row mask 1, bank mask 1, false
; final flag), and stores the result back to the same address.
; Every run configuration below expects the operation to be emitted as two
; v_mov_b32_dpp instructions, one per 32-bit half of the register pair, each
; with quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1.
; %in2 is declared but not referenced by the body.
define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
; GFX8-OPT-LABEL: update_dppi64_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-OPT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-OPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-OPT-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-OPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-OPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dppi64_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[0:1], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 3
; GFX8-NOOPT-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr2
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s4
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NOOPT-NEXT:    s_mov_b32 s4, s5
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v0, s[2:3], s2, v0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3]
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NOOPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v5, v3
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v2
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppi64_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppi64_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  ; The DPP source is a per-workitem value loaded from memory rather than a
  ; kernel argument.
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
  %load = load i64, ptr addrspace(1) %gep
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
  store i64 %tmp0, ptr addrspace(1) %gep
  ret void
}

; Double-precision payload. Each workitem loads the double at
; %arg[workitem.id.x], passes %in1 as the first operand and the loaded value as
; the second operand of llvm.amdgcn.update.dpp.f64 (control 1, row mask 1,
; bank mask 1, false final flag), and stores the result back to the same
; address.
; The expected machine code is the same instruction sequence as for the i64
; variant of this test: two v_mov_b32_dpp instructions, one per 32-bit half.
; %in2 is declared but not referenced by the body.
define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
; GFX8-OPT-LABEL: update_dppf64_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-OPT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-OPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-OPT-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-OPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-OPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dppf64_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[0:1], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 3
; GFX8-NOOPT-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr2
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s4
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NOOPT-NEXT:    s_mov_b32 s4, s5
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v0, s[2:3], s2, v0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3]
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NOOPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v5, v3
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v2
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppf64_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppf64_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  ; The DPP source is a per-workitem value loaded from memory rather than a
  ; kernel argument.
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
  %load = load double, ptr addrspace(1) %gep
  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double %load, i32 1, i32 1, i32 1, i1 false) #0
  store double %tmp0, ptr addrspace(1) %gep
  ret void
}

; Two-element i32 vector payload. Each workitem loads the <2 x i32> at
; %arg[workitem.id.x], passes %in1 as the first operand and the loaded vector
; as the second operand of llvm.amdgcn.update.dpp.v2i32 (control 1, row mask 1,
; bank mask 1, false final flag), and stores the result back to the same
; address.
; Every run configuration below expects two v_mov_b32_dpp instructions, one
; per 32-bit register of the pair, each with quad_perm:[1,0,0,0]
; row_mask:0x1 bank_mask:0x1.
; %in2 is declared but not referenced by the body.
define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
; GFX8-OPT-LABEL: update_dppv2i32_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-OPT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-OPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-OPT-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-OPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-OPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dppv2i32_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[0:1], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 3
; GFX8-NOOPT-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr2
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s4
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NOOPT-NEXT:    s_mov_b32 s4, s5
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v0, s[2:3], s2, v0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3]
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NOOPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v5, v3
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v2
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppv2i32_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppv2i32_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  ; The DPP source is a per-workitem value loaded from memory rather than a
  ; kernel argument.
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
  %load = load <2 x i32>, ptr addrspace(1) %gep
  %tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #0
  store <2 x i32> %tmp0, ptr addrspace(1) %gep
  ret void
}

; Two-element float vector payload. Each workitem loads the <2 x float> at
; %arg[workitem.id.x], passes %in1 as the first operand and the loaded vector
; as the second operand of llvm.amdgcn.update.dpp.v2f32 (control 1, row mask 1,
; bank mask 1, false final flag), and stores the result back to the same
; address.
; Every run configuration below expects two v_mov_b32_dpp instructions, one
; per 32-bit register of the pair, each with quad_perm:[1,0,0,0]
; row_mask:0x1 bank_mask:0x1.
; %in2 is declared but not referenced by the body.
define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
; GFX8-OPT-LABEL: update_dppv2f32_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-OPT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-OPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-OPT-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-OPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-OPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dppv2f32_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[0:1], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 3
; GFX8-NOOPT-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr2
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s4
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NOOPT-NEXT:    s_mov_b32 s4, s5
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v0, s[2:3], s2, v0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3]
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NOOPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v5, v3
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v2
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppv2f32_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppv2f32_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  ; The DPP source is a per-workitem value loaded from memory rather than a
  ; kernel argument.
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep
  %tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #0
  store <2 x float> %tmp0, ptr addrspace(1) %gep
  ret void
}

define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
; GFX8-OPT-LABEL: update_dpp_p0_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-OPT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-OPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-OPT-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-OPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-OPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dpp_p0_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[0:1], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 3
; GFX8-NOOPT-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr2
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s4
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NOOPT-NEXT:    s_mov_b32 s4, s5
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v0, s[2:3], s2, v0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3]
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NOOPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v5, v3
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v2
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dpp_p0_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dpp_p0_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
  %load = load ptr, ptr addrspace(1) %gep
  %tmp0 = call ptr @llvm.amdgcn.update.dpp.p0(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #0
  store ptr %tmp0, ptr addrspace(1) %gep
  ret void
}

; Test llvm.amdgcn.update.dpp on an addrspace(3) pointer value.
; Each work-item loads a pointer from %arg[workitem.id.x], combines it with
; %in1 through the intrinsic and stores the result back to the same slot.
; %in2 is not referenced by the body.
; The expected output uses one v_mov_b32_dpp per run line, with the loaded
; value as the DPP source and a register pre-loaded with %in1 as destination.
define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) {
; GFX8-OPT-LABEL: update_dpp_p3_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-OPT-NEXT:    s_mov_b32 m0, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-OPT-NEXT:    ds_read_b32 v1, v0
; GFX8-OPT-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    s_nop 0
; GFX8-OPT-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    ds_write_b32 v0, v2
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dpp_p3_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_load_dword s1, s[4:5], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s0, s[4:5], 0x28
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 2
; GFX8-NOOPT-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v0, s[2:3], s1, v0
; GFX8-NOOPT-NEXT:    s_mov_b32 m0, -1
; GFX8-NOOPT-NEXT:    ds_read_b32 v2, v0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s0
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_nop 0
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    s_mov_b32 m0, -1
; GFX8-NOOPT-NEXT:    ds_write_b32 v0, v1
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dpp_p3_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    ds_read_b32 v1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    ds_write_b32 v0, v2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dpp_p3_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT:    v_mov_b32_e32 v2, s1
; GFX11-NEXT:    ds_load_b32 v1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    ds_store_b32 v0, v2
; GFX11-NEXT:    s_endpgm
  ; Per-work-item slot: %arg indexed by the work-item's x id.
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %arg, i32 %id
  %load = load ptr addrspace(3), ptr addrspace(3) %gep
  ; The three i32 immediates (all 1) show up in the expected output as
  ; quad_perm:[1,0,0,0], row_mask:0x1 and bank_mask:0x1.
  %tmp0 = call ptr addrspace(3) @llvm.amdgcn.update.dpp.p3(ptr addrspace(3) %in1, ptr addrspace(3) %load, i32 1, i32 1, i32 1, i1 false) #0
  store ptr addrspace(3) %tmp0, ptr addrspace(3) %gep
  ret void
}

; Test llvm.amdgcn.update.dpp on an addrspace(5) pointer value.
; Each work-item loads a pointer from %arg[workitem.id.x], combines it with
; %in1 through the intrinsic and stores the result back to the same slot.
; %in2 is not referenced by the body.
; The expected output accesses the slot with buffer_load/buffer_store through
; a SCRATCH_RSRC descriptor on tonga and gfx1010, and with
; scratch_load/scratch_store on gfx1100; one v_mov_b32_dpp is expected in
; every configuration.
define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspace(5) %in1, ptr %in2) {
; GFX8-OPT-LABEL: update_dpp_p5_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GFX8-OPT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GFX8-OPT-NEXT:    s_mov_b32 s90, -1
; GFX8-OPT-NEXT:    s_mov_b32 s91, 0xe80000
; GFX8-OPT-NEXT:    s_add_u32 s88, s88, s11
; GFX8-OPT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-OPT-NEXT:    s_addc_u32 s89, s89, 0
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-OPT-NEXT:    buffer_load_dword v1, v0, s[88:91], 0 offen
; GFX8-OPT-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-OPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-OPT-NEXT:    s_nop 0
; GFX8-OPT-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dpp_p5_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GFX8-NOOPT-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GFX8-NOOPT-NEXT:    s_mov_b32 s90, -1
; GFX8-NOOPT-NEXT:    s_mov_b32 s91, 0xe80000
; GFX8-NOOPT-NEXT:    s_add_u32 s88, s88, s11
; GFX8-NOOPT-NEXT:    s_addc_u32 s89, s89, 0
; GFX8-NOOPT-NEXT:    s_load_dword s1, s[4:5], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s0, s[4:5], 0x28
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 2
; GFX8-NOOPT-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v1, s[2:3], s1, v0
; GFX8-NOOPT-NEXT:    buffer_load_dword v2, v1, s[88:91], 0 offen
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NOOPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NOOPT-NEXT:    s_nop 0
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dpp_p5_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s14, -1
; GFX10-NEXT:    s_mov_b32 s15, 0x31c16000
; GFX10-NEXT:    s_add_u32 s12, s12, s11
; GFX10-NEXT:    s_addc_u32 s13, s13, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    buffer_load_dword v1, v0, s[12:15], 0 offen
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dpp_p5_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT:    v_mov_b32_e32 v2, s1
; GFX11-NEXT:    scratch_load_b32 v1, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    scratch_store_b32 v0, v2, off
; GFX11-NEXT:    s_endpgm
  ; Per-work-item slot: %arg indexed by the work-item's x id.
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds ptr addrspace(5), ptr addrspace(5) %arg, i32 %id
  %load = load ptr addrspace(5), ptr addrspace(5) %gep
  ; The three i32 immediates (all 1) show up in the expected output as
  ; quad_perm:[1,0,0,0], row_mask:0x1 and bank_mask:0x1.
  %tmp0 = call ptr addrspace(5) @llvm.amdgcn.update.dpp.p5(ptr addrspace(5) %in1, ptr addrspace(5) %load, i32 1, i32 1, i32 1, i1 false) #0
  store ptr addrspace(5) %tmp0, ptr addrspace(5) %gep
  ret void
}

; Test llvm.amdgcn.update.dpp.i64 with an immediate as the first ("old")
; operand and a value loaded from %arg[workitem.id.x] as the second operand.
; %in2 is not referenced by the body.
; The expected output splits the 64-bit operation into two v_mov_b32_dpp
; instructions. The constant 123451234512345 appears as the dwords 0x7047
; (paired with the high half of the loaded value) and 0x3afaedd9 (paired with
; the low half), each placed in the destination register before its DPP move.
define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64 %in2) {
; GFX8-OPT-LABEL: update_dppi64_imm_old_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX8-OPT-NEXT:    v_mov_b32_e32 v5, 0x7047
; GFX8-OPT-NEXT:    v_mov_b32_e32 v4, 0x3afaedd9
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-OPT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-OPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-OPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-OPT-NEXT:    s_nop 0
; GFX8-OPT-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dppi64_imm_old_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, 3
; GFX8-NOOPT-NEXT:    v_lshlrev_b32_e64 v1, s0, v0
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, 0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, s2
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s3
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NOOPT-NEXT:    v_addc_u32_e64 v2, s[0:1], v1, v2, s[0:1]
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 0x7047
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, 0x3afaedd9
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s2
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s1
; GFX8-NOOPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v5, v3
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v2
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppi64_imm_old_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7047
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3afaedd9
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppi64_imm_old_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v3, 0x7047
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x3afaedd9
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  ; Per-work-item i64 slot: %arg indexed by the work-item's x id.
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
  %load = load i64, ptr addrspace(1) %gep
  ; Immediate in the first operand position; the loaded value is the second.
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
  store i64 %tmp0, ptr addrspace(1) %gep
  ret void
}

; Test llvm.amdgcn.update.dpp.f64 with a floating-point immediate as the
; first ("old") operand and a double loaded from %arg[workitem.id.x] as the
; second operand. %in2 is not referenced by the body.
; The expected output splits the 64-bit operation into two v_mov_b32_dpp
; instructions. The double constant appears as the dwords 0x405edce1 (paired
; with the high half of the loaded value) and 0x6b8564a (paired with the low
; half), each placed in the destination register before its DPP move.
define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, double %in2) {
; GFX8-OPT-LABEL: update_dppf64_imm_old_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX8-OPT-NEXT:    v_mov_b32_e32 v5, 0x405edce1
; GFX8-OPT-NEXT:    v_mov_b32_e32 v4, 0x6b8564a
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-OPT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-OPT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-OPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-OPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-OPT-NEXT:    s_nop 0
; GFX8-OPT-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dppf64_imm_old_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, 3
; GFX8-NOOPT-NEXT:    v_lshlrev_b32_e64 v1, s0, v0
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, 0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, s2
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s3
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NOOPT-NEXT:    v_addc_u32_e64 v2, s[0:1], v1, v2, s[0:1]
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, 0x405edce1
; GFX8-NOOPT-NEXT:    s_mov_b32 s0, 0x6b8564a
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s2
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s1
; GFX8-NOOPT-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v5, v3
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v4, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v2
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppf64_imm_old_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x405edce1
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x6b8564a
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppf64_imm_old_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v3, 0x405edce1
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x6b8564a
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): the GEP indexes with i64 elements although the slot is
  ; loaded and stored as double; the expected output scales the index by 8
  ; (shift left by 3) either way.
  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
  %load = load double, ptr addrspace(1) %gep
  ; Immediate in the first operand position; the loaded value is the second.
  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double 123.4512345123450, double %load, i32 1, i32 1, i32 1, i1 false) #0
  store double %tmp0, ptr addrspace(1) %gep
  ret void
}

; Test llvm.amdgcn.update.dpp.i64 with the kernel argument %in1 as the first
; ("old") operand and an immediate as the second ("src") operand. The result
; is stored to %out; no per-work-item indexing is involved.
; The expected output splits the 64-bit operation into two v_mov_b32_dpp
; instructions. The constant 123451234512345 is moved into vector registers
; as the dwords 0x7047 and 0x3afaedd9, which are used as the DPP sources for
; the high and low halves of %in1 respectively.
define amdgpu_kernel void @update_dppi64_imm_src_test(ptr addrspace(1) %out, i64 %in1) {
; GFX8-OPT-LABEL: update_dppi64_imm_src_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, 0x7047
; GFX8-OPT-NEXT:    v_mov_b32_e32 v2, 0x3afaedd9
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v1, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    s_nop 1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dppi64_imm_src_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2c
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s5, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s5
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s4
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, 0x7047
; GFX8-NOOPT-NEXT:    s_mov_b32 s4, 0x3afaedd9
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX8-NOOPT-NEXT:    s_mov_b32 s5, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s5
; GFX8-NOOPT-NEXT:    s_mov_b32 s9, s7
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, s9
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s8
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX8-NOOPT-NEXT:    s_mov_b32 s5, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr4
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr4
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppi64_imm_src_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7047
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x3afaedd9
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppi64_imm_src_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x7047
; GFX11-NEXT:    v_mov_b32_e32 v3, 0x3afaedd9
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; Immediate in the second operand position; %in1 is the first.
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 false) #0
  store i64 %tmp0, ptr addrspace(1) %out
  ret void
}

; Test llvm.amdgcn.update.dpp.f64 with the kernel argument %in1 as the first
; ("old") operand and a floating-point immediate as the second ("src")
; operand. The result is stored to %out; no per-work-item indexing is
; involved.
; The expected output splits the 64-bit operation into two v_mov_b32_dpp
; instructions. The double constant is moved into vector registers as the
; dwords 0x405edce1 and 0x6b8564a, which are used as the DPP sources for the
; high and low halves of %in1 respectively.
define amdgpu_kernel void @update_dppf64_imm_src_test(ptr addrspace(1) %out, double %in1) {
; GFX8-OPT-LABEL: update_dppf64_imm_src_test:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, 0x405edce1
; GFX8-OPT-NEXT:    v_mov_b32_e32 v2, 0x6b8564a
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v1, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    s_nop 1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: update_dppf64_imm_src_test:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2c
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s4, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s5, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s5
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s4
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, 0x405edce1
; GFX8-NOOPT-NEXT:    s_mov_b32 s4, 0x6b8564a
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX8-NOOPT-NEXT:    s_mov_b32 s5, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s5
; GFX8-NOOPT-NEXT:    s_mov_b32 s9, s7
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v2, s9
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s8
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX8-NOOPT-NEXT:    s_mov_b32 s5, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr4
; GFX8-NOOPT-NEXT:    ; implicit-def: $sgpr4
; GFX8-NOOPT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: update_dppf64_imm_src_test:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x405edce1
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x6b8564a
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: update_dppf64_imm_src_test:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x405edce1
; GFX11-NEXT:    v_mov_b32_e32 v3, 0x6b8564a
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mov_b32_dpp v1, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    v_mov_b32_dpp v0, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; Immediate in the second operand position; %in1 is the first.
  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double 123.451234512345, i32 1, i32 1, i32 1, i1 false) #0
  store double %tmp0, ptr addrspace(1) %out
  ret void
}

; Test llvm.amdgcn.update.dpp.f32 with two float kernel arguments and all
; three i32 immediates set to 1. The result is stored to %out.
; The expected output is a single v_mov_b32_dpp whose destination register is
; pre-loaded with %in1 and whose DPP source holds %in2, carrying
; quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1.
; Unlike the pointer and 64-bit tests in this file, the call here carries no
; #0 attribute group.
define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float %in2) {
; GFX8-OPT-LABEL: dpp_test_f32:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_f32:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 1, i32 1, i1 false)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; Test llvm.amdgcn.update.dpp.f32 with all three i32 immediates set to 0.
; The operands and the store to %out are otherwise the same as in
; dpp_test_f32.
; The expected output is a single v_mov_b32_dpp carrying
; quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0, i.e. each immediate is
; encoded directly into the corresponding modifier.
define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float %in1, float %in2) {
; GFX8-OPT-LABEL: dpp_test_f32_imm_comb1:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb1:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_f32_imm_comb1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_f32_imm_comb1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 0, i32 0, i32 0, i1 false)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; update.dpp on float operands with the immediates (3, 3, 3, false).
; Every checked configuration expects one v_mov_b32_dpp carrying
; quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 and no bound_ctrl modifier,
; with %in1 in the destination register and %in2 as the DPP source.
define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float %in1, float %in2) {
; GFX8-OPT-LABEL: dpp_test_f32_imm_comb2:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb2:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_f32_imm_comb2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_f32_imm_comb2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 3, i32 3, i32 3, i1 false)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; update.dpp on float operands with the immediates (1, 2, 3, true).
; Using three distinct values shows which immediate feeds which modifier:
; the checks expect quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3, and the
; i1 true shows up as bound_ctrl:1.
define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float %in1, float %in2) {
; GFX8-OPT-LABEL: dpp_test_f32_imm_comb3:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb3:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_f32_imm_comb3:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_f32_imm_comb3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 2, i32 3, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; update.dpp on float operands with the immediates (4, 3, 2, true).
; The first immediate, 4, is expected to print as quad_perm:[0,1,0,0], i.e.
; the value lands in the second quad_perm slot rather than the first. The
; remaining modifiers are row_mask:0x3 bank_mask:0x2 bound_ctrl:1.
define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %in1, float %in2) {
; GFX8-OPT-LABEL: dpp_test_f32_imm_comb4:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb4:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_f32_imm_comb4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_f32_imm_comb4:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 4, i32 3, i32 2, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; update.dpp on float operands with the immediates (63, 14, 13, true).
; The first immediate, 63, is expected to print as quad_perm:[3,3,3,0]; the
; masks 14 and 13 are expected in hex as row_mask:0xe bank_mask:0xd, and the
; i1 true as bound_ctrl:1.
define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %in1, float %in2) {
; GFX8-OPT-LABEL: dpp_test_f32_imm_comb5:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb5:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_f32_imm_comb5:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_f32_imm_comb5:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 14, i32 13, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; update.dpp on float operands with the immediates (63, 15, 15, true).
; Both mask immediates are 15; the checks expect them printed as
; row_mask:0xf bank_mask:0xf, together with quad_perm:[3,3,3,0] and
; bound_ctrl:1.
define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %in1, float %in2) {
; GFX8-OPT-LABEL: dpp_test_f32_imm_comb6:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb6:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_f32_imm_comb6:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_f32_imm_comb6:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 15, i32 15, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}


; update.dpp on float operands with the immediates (64, 0, 0, true).
; The first immediate, 64, is expected to print as quad_perm:[0,0,0,1], i.e.
; only the last quad_perm slot is non-zero. Both masks are zero
; (row_mask:0x0 bank_mask:0x0) and bound_ctrl:1 is still expected.
define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %in1, float %in2) {
; GFX8-OPT-LABEL: dpp_test_f32_imm_comb7:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb7:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_f32_imm_comb7:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_f32_imm_comb7:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 0, i32 0, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; update.dpp on float operands with the immediates (31, 15, 0, true).
; The first immediate, 31, is expected to print as quad_perm:[3,3,1,0], so
; three of the four quad_perm slots are non-zero and differ. The masks are
; asymmetric here: row_mask:0xf with bank_mask:0x0, plus bound_ctrl:1.
define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %in1, float %in2) {
; GFX8-OPT-LABEL: dpp_test_f32_imm_comb8:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_f32_imm_comb8:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_f32_imm_comb8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_f32_imm_comb8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 15, i32 0, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; Baseline update.dpp test for <2 x i16> operands, immediates (1, 1, 1, false).
; The checks expect the packed vector to be handled as one 32-bit value: a
; single v_mov_b32_dpp with quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; followed by one dword store, with no bound_ctrl modifier.
define amdgpu_kernel void @dpp_test_v2i16(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2i16:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2i16:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 1, i32 1, i1 false)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; update.dpp on <2 x i16> operands with all-zero immediates (0, 0, 0, false).
; The checks expect the zero controls to be emitted explicitly as
; quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 on a single v_mov_b32_dpp,
; with no bound_ctrl modifier.
define amdgpu_kernel void @dpp_test_v2i16_imm_comb1(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb1:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb1:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2i16_imm_comb1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2i16_imm_comb1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 0, i32 0, i32 0, i1 false)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; update.dpp on <2 x i16> operands with the immediates (3, 3, 3, false).
; Every checked configuration expects one v_mov_b32_dpp carrying
; quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 and no bound_ctrl modifier.
define amdgpu_kernel void @dpp_test_v2i16_imm_comb2(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb2:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb2:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2i16_imm_comb2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2i16_imm_comb2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 3, i32 3, i32 3, i1 false)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; update.dpp on <2 x i16> operands with the immediates (1, 2, 3, true).
; Using three distinct values shows which immediate feeds which modifier:
; the checks expect quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3, and the
; i1 true shows up as bound_ctrl:1.
define amdgpu_kernel void @dpp_test_v2i16_imm_comb3(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb3:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb3:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2i16_imm_comb3:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2i16_imm_comb3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 2, i32 3, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x i16> with the immediate tuple
; (4, 3, 2, true). Every checked configuration expects one v_mov_b32_dpp
; carrying quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1,
; followed by a single 32-bit buffer store of the result to %out.
define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb4:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb4:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2i16_imm_comb4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2i16_imm_comb4:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the trailing immediates are printed, in
  ; order, as quad_perm, row_mask, bank_mask and bound_ctrl.
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 4, i32 3, i32 2, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x i16> with the immediate tuple
; (63, 14, 13, true). Every checked configuration expects one v_mov_b32_dpp
; carrying quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1,
; followed by a single 32-bit buffer store of the result to %out.
define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb5:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb5:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2i16_imm_comb5:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2i16_imm_comb5:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the trailing immediates are printed, in
  ; order, as quad_perm, row_mask, bank_mask and bound_ctrl.
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 14, i32 13, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x i16> with the immediate tuple
; (63, 15, 15, true), i.e. both mask immediates at 0xf. Every checked
; configuration expects one v_mov_b32_dpp carrying quad_perm:[3,3,3,0]
; row_mask:0xf bank_mask:0xf bound_ctrl:1, followed by a single 32-bit buffer
; store of the result to %out.
define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb6:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb6:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2i16_imm_comb6:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2i16_imm_comb6:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the trailing immediates are printed, in
  ; order, as quad_perm, row_mask, bank_mask and bound_ctrl.
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 15, i32 15, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x i16> with the immediate tuple
; (64, 0, 0, true), i.e. both mask immediates at zero. Every checked
; configuration expects one v_mov_b32_dpp carrying quad_perm:[0,0,0,1]
; row_mask:0x0 bank_mask:0x0 bound_ctrl:1, followed by a single 32-bit buffer
; store of the result to %out.
define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb7:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb7:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2i16_imm_comb7:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2i16_imm_comb7:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the trailing immediates are printed, in
  ; order, as quad_perm, row_mask, bank_mask and bound_ctrl.
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 0, i32 0, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x i16> with the immediate tuple
; (31, 15, 0, true). Every checked configuration expects one v_mov_b32_dpp
; carrying quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1,
; followed by a single 32-bit buffer store of the result to %out.
define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2i16_imm_comb8:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2i16_imm_comb8:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2i16_imm_comb8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2i16_imm_comb8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the trailing immediates are printed, in
  ; order, as quad_perm, row_mask, bank_mask and bound_ctrl.
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 15, i32 0, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x half> with the immediate tuple
; (1, 1, 1, false). Every checked configuration expects one v_mov_b32_dpp
; carrying quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 and no bound_ctrl
; modifier, followed by a single 32-bit buffer store of the result to %out.
define amdgpu_kernel void @dpp_test_v2f16(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2f16:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2f16:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the i32 immediates are printed, in order,
  ; as quad_perm, row_mask and bank_mask. The final i1 is false here and the
  ; expected instruction has no bound_ctrl modifier.
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 1, i32 1, i1 false)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x half> with every immediate zero or
; false: (0, 0, 0, false). Every checked configuration expects one
; v_mov_b32_dpp carrying quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 and no
; bound_ctrl modifier, followed by a single 32-bit buffer store to %out.
define amdgpu_kernel void @dpp_test_v2f16_imm_comb1(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb1:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb1:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2f16_imm_comb1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2f16_imm_comb1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the i32 immediates are printed, in order,
  ; as quad_perm, row_mask and bank_mask. The final i1 is false here and the
  ; expected instruction has no bound_ctrl modifier.
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 0, i32 0, i32 0, i1 false)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x half> with the immediate tuple
; (3, 3, 3, false). Every checked configuration expects one v_mov_b32_dpp
; carrying quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 and no bound_ctrl
; modifier, followed by a single 32-bit buffer store of the result to %out.
define amdgpu_kernel void @dpp_test_v2f16_imm_comb2(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb2:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb2:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2f16_imm_comb2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2f16_imm_comb2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the i32 immediates are printed, in order,
  ; as quad_perm, row_mask and bank_mask. The final i1 is false here and the
  ; expected instruction has no bound_ctrl modifier.
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 3, i32 3, i32 3, i1 false)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x half> with the immediate tuple
; (1, 2, 3, true). The three i32 values are all different, so the expected
; text shows which immediate feeds which modifier. Every checked configuration
; expects one v_mov_b32_dpp carrying quad_perm:[1,0,0,0] row_mask:0x2
; bank_mask:0x3 bound_ctrl:1, followed by a single 32-bit buffer store to %out.
define amdgpu_kernel void @dpp_test_v2f16_imm_comb3(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb3:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb3:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2f16_imm_comb3:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2f16_imm_comb3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the trailing immediates are printed, in
  ; order, as quad_perm, row_mask, bank_mask and bound_ctrl.
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 2, i32 3, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x half> with the immediate tuple
; (4, 3, 2, true). Every checked configuration expects one v_mov_b32_dpp
; carrying quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1,
; followed by a single 32-bit buffer store of the result to %out.
define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb4:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb4:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2f16_imm_comb4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2f16_imm_comb4:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the trailing immediates are printed, in
  ; order, as quad_perm, row_mask, bank_mask and bound_ctrl.
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 4, i32 3, i32 2, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; Exercises llvm.amdgcn.update.dpp on <2 x half> with the immediate tuple
; (63, 14, 13, true). Every checked configuration expects one v_mov_b32_dpp
; carrying quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1,
; followed by a single 32-bit buffer store of the result to %out.
define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb5:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb5:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2f16_imm_comb5:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2f16_imm_comb5:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  ; In the expected assembly %in1 is copied into the destination register (v0)
  ; and %in2 into the DPP source (v1); the trailing immediates are printed, in
  ; order, as quad_perm, row_mask, bank_mask and bound_ctrl.
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 14, i32 13, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; Immediate-operand combination 6 for the <2 x half> overload of
; llvm.amdgcn.update.dpp. The call passes the immediates 63, 15, 15 and true
; after the two vector operands; every run configuration is expected to emit a
; single v_mov_b32_dpp carrying quad_perm:[3,3,3,0] (63 = 0b00111111),
; row_mask 0xf, bank_mask 0xf and bound_ctrl:1, followed by a store of the
; result to %out. Only the surrounding scalar setup differs per target: the
; -O0 run has an s_nop 1 before the DPP move and the gfx1100 run has an
; s_delay_alu before it.
define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb6:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb6:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2f16_imm_comb6:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2f16_imm_comb6:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 15, i32 15, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; Immediate-operand combination 7 for the <2 x half> overload of
; llvm.amdgcn.update.dpp. The call passes the immediates 64, 0, 0 and true
; after the two vector operands; every run configuration is expected to emit a
; single v_mov_b32_dpp carrying quad_perm:[0,0,0,1] (64 = 0b01000000),
; row_mask 0x0, bank_mask 0x0 and bound_ctrl:1, followed by a store of the
; result to %out. The expected output still contains the DPP move even though
; both masks are zero.
define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb7:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb7:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2f16_imm_comb7:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2f16_imm_comb7:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 0, i32 0, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; Immediate-operand combination 8 for the <2 x half> overload of
; llvm.amdgcn.update.dpp. The call passes the immediates 31, 15, 0 and true
; after the two vector operands; every run configuration is expected to emit a
; single v_mov_b32_dpp carrying quad_perm:[3,3,1,0] (31 = 0b00011111),
; row_mask 0xf, bank_mask 0x0 and bound_ctrl:1, followed by a store of the
; result to %out.
define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
; GFX8-OPT-LABEL: dpp_test_v2f16_imm_comb8:
; GFX8-OPT:       ; %bb.0:
; GFX8-OPT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-OPT-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-OPT-NEXT:    s_mov_b32 s6, -1
; GFX8-OPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-OPT-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-OPT-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-OPT-NEXT:    s_mov_b32 s4, s0
; GFX8-OPT-NEXT:    s_mov_b32 s5, s1
; GFX8-OPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX8-OPT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-OPT-NEXT:    s_endpgm
;
; GFX8-NOOPT-LABEL: dpp_test_v2f16_imm_comb8:
; GFX8-NOOPT:       ; %bb.0:
; GFX8-NOOPT-NEXT:    s_mov_b64 s[2:3], s[4:5]
; GFX8-NOOPT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NOOPT-NEXT:    s_load_dword s5, s[2:3], 0x2c
; GFX8-NOOPT-NEXT:    s_load_dword s4, s[2:3], 0x30
; GFX8-NOOPT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOOPT-NEXT:    s_mov_b32 s8, s1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX8-NOOPT-NEXT:    s_mov_b32 s6, 0xf000
; GFX8-NOOPT-NEXT:    s_mov_b32 s7, -1
; GFX8-NOOPT-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX8-NOOPT-NEXT:    s_mov_b32 s1, s8
; GFX8-NOOPT-NEXT:    s_mov_b32 s2, s7
; GFX8-NOOPT-NEXT:    s_mov_b32 s3, s6
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NOOPT-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT:    s_nop 1
; GFX8-NOOPT-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX8-NOOPT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NOOPT-NEXT:    s_endpgm
;
; GFX10-LABEL: dpp_test_v2f16_imm_comb8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: dpp_test_v2f16_imm_comb8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 15, i32 0, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; Intrinsic declarations for the test functions in this file.
; NOTE(review): workitem.id.x and s.barrier are not referenced in the tail of
; the file; presumably earlier tests use them - confirm before removing.
declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()
; One update.dpp overload per tested value type. Each takes two operands of
; that type followed by three i32 operands and one i1 operand.
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
declare <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16>, <2 x i16>, i32, i32, i32, i1) #0
declare <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half>, <2 x half>, i32, i32, i32, i1) #0
declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0

; Attribute group shared by all update.dpp declarations above.
attributes #0 = { nounwind readnone convergent }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
; GCN-OPT: {{.*}}
; GFX8: {{.*}}
