; blob: beeeaa32cacfdd82016da880628e3e294e4238e5 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=GCN-O0 %s
; Dynamic-index insertelement on <4 x float>: writes 1.0 into lane %sel of the
; %vec kernel argument and stores the resulting vector to %out.
; Code expected by the assertions below:
;  * default optimization level: no indexed register write. %sel is compared
;    against 3, 2, 1 and 0 (s_cmp_lg_u32) and each lane is chosen with
;    v_cndmask_b32 between 1.0 and the original element.
;  * -O0: %sel is copied into m0 and the lane is written with v_movreld_b32.
define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) {
; GCN-LABEL: float4_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s6, s[4:5], 0x44
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 3
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_lg_u32 s6, 2
; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_lg_u32 s6, 1
; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: float4_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v4, s6
; GCN-O0-NEXT: v_mov_b32_e32 v5, s7
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
store <4 x float> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement of 1.0 into a <4 x float> whose other lanes are
; unspecified. The function name says "undef", but the base vector in the IR
; below is `poison`.
; Code expected by the assertions below:
;  * default optimization level: %sel is never loaded; v0..v3 are all set to
;    1.0 and stored, i.e. every lane receives the inserted value.
;  * -O0: %sel is still loaded and moved into m0, and v_movreld_b32 writes into
;    a register tuple that is only marked implicit-def.
define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) {
; GCN-LABEL: float4_inselt_undef:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: float4_inselt_undef:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v4, s6
; GCN-O0-NEXT: v_mov_b32_e32 v5, s7
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <4 x float> poison, float 1.000000e+00, i32 %sel
store <4 x float> %v, ptr addrspace(1) %out
ret void
}
; Integer counterpart of the <4 x float> case: writes the constant 1 into lane
; %sel of the <4 x i32> %vec kernel argument and stores the result to %out.
; Code expected by the assertions below:
;  * default optimization level: the selection stays on the scalar unit; each
;    lane is an s_cmp_lg_u32 against 3, 2, 1, 0 followed by s_cselect_b32
;    between the original element and 1, and only then copied to VGPRs.
;  * -O0: %sel is copied into m0 and the lane is written with v_movreld_b32.
define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) {
; GCN-LABEL: int4_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s6, s[4:5], 0x44
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 3
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 2
; GCN-NEXT: s_cselect_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 1
; GCN-NEXT: s_cselect_b32 s1, s1, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s0, s0, 1
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: int4_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v4, s6
; GCN-O0-NEXT: v_mov_b32_e32 v5, s7
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <4 x i32> %vec, i32 1, i32 %sel
store <4 x i32> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement on <2 x float>: writes 1.0 into lane %sel of the
; %vec kernel argument and stores the two-element result to %out.
; Code expected by the assertions below:
;  * default optimization level: two compare/select pairs (s_cmp_lg_u32 with 1
;    and 0, each feeding a v_cndmask_b32 between 1.0 and the original element).
;  * -O0: %sel is copied into m0 and the lane is written with v_movreld_b32.
define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) {
; GCN-LABEL: float2_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s2, 1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: float2_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c
; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x34
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v3, s5
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
store <2 x float> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement on <8 x float>: writes 1.0 into lane %sel of the
; %vec kernel argument and stores the result to %out.
; Unlike the 2- and 4-element float cases, the assertions for the default
; optimization level expect an indexed register write here: the vector is
; copied into v0..v7, %sel goes into m0, and v_movreld_b32 writes the inline
; constant 1.0. The result is stored as two flat_store_dwordx4 (at %out + 16
; and at %out).
; At -O0 the same m0 + v_movreld_b32 sequence is expected, with 1.0 first
; materialized in v0 and extra register copies around each store.
define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec, i32 %sel) {
; GCN-LABEL: float8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s2, s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: s_add_u32 s2, s0, 16
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_mov_b32_e32 v5, s13
; GCN-NEXT: v_mov_b32_e32 v6, s14
; GCN-NEXT: v_mov_b32_e32 v7, s15
; GCN-NEXT: v_mov_b32_e32 v9, s3
; GCN-NEXT: v_movreld_b32_e32 v0, 1.0
; GCN-NEXT: v_mov_b32_e32 v8, s2
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: float8_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-O0-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x64
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v14, s11
; GCN-O0-NEXT: v_mov_b32_e32 v13, s10
; GCN-O0-NEXT: v_mov_b32_e32 v12, s9
; GCN-O0-NEXT: v_mov_b32_e32 v11, s8
; GCN-O0-NEXT: v_mov_b32_e32 v10, s7
; GCN-O0-NEXT: v_mov_b32_e32 v9, s6
; GCN-O0-NEXT: v_mov_b32_e32 v8, s5
; GCN-O0-NEXT: v_mov_b32_e32 v7, s4
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v14
; GCN-O0-NEXT: v_mov_b32_e32 v1, v13
; GCN-O0-NEXT: v_mov_b32_e32 v6, v12
; GCN-O0-NEXT: v_mov_b32_e32 v2, v11
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 16
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v10
; GCN-O0-NEXT: v_mov_b32_e32 v1, v9
; GCN-O0-NEXT: v_mov_b32_e32 v6, v8
; GCN-O0-NEXT: v_mov_b32_e32 v2, v7
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
store <8 x float> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement on <16 x float>: writes 1.0 into lane %sel of the
; %vec kernel argument and stores the result to %out.
; Code expected by the assertions below:
;  * default optimization level: the vector is loaded with one
;    s_load_dwordx16, copied into v0..v15, %sel goes into m0, and
;    v_movreld_b32 writes the inline constant 1.0. The result is stored as
;    four flat_store_dwordx4 at %out + 48, + 32, + 16 and + 0.
;  * -O0: the same m0 + v_movreld_b32 approach, with 1.0 first materialized in
;    v0 and each 64-bit store address rebuilt through explicit
;    s_add_u32 / s_addc_u32 on a materialized offset.
define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %vec, i32 %sel) {
; GCN-LABEL: float16_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_load_dword s4, s[4:5], 0xa4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: s_add_u32 s2, s0, 48
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_mov_b32_e32 v17, s3
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_mov_b32_e32 v5, s13
; GCN-NEXT: v_mov_b32_e32 v6, s14
; GCN-NEXT: v_mov_b32_e32 v7, s15
; GCN-NEXT: v_mov_b32_e32 v8, s16
; GCN-NEXT: v_mov_b32_e32 v9, s17
; GCN-NEXT: v_mov_b32_e32 v10, s18
; GCN-NEXT: v_mov_b32_e32 v11, s19
; GCN-NEXT: v_mov_b32_e32 v12, s20
; GCN-NEXT: v_mov_b32_e32 v13, s21
; GCN-NEXT: v_mov_b32_e32 v14, s22
; GCN-NEXT: v_mov_b32_e32 v15, s23
; GCN-NEXT: s_mov_b32 m0, s4
; GCN-NEXT: v_mov_b32_e32 v16, s2
; GCN-NEXT: s_add_u32 s2, s0, 32
; GCN-NEXT: v_movreld_b32_e32 v0, 1.0
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v13, s3
; GCN-NEXT: v_mov_b32_e32 v12, s2
; GCN-NEXT: s_add_u32 s2, s0, 16
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v9, s3
; GCN-NEXT: v_mov_b32_e32 v8, s2
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: float16_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-O0-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0xa4
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v22, s19
; GCN-O0-NEXT: v_mov_b32_e32 v21, s18
; GCN-O0-NEXT: v_mov_b32_e32 v20, s17
; GCN-O0-NEXT: v_mov_b32_e32 v19, s16
; GCN-O0-NEXT: v_mov_b32_e32 v18, s15
; GCN-O0-NEXT: v_mov_b32_e32 v17, s14
; GCN-O0-NEXT: v_mov_b32_e32 v16, s13
; GCN-O0-NEXT: v_mov_b32_e32 v15, s12
; GCN-O0-NEXT: v_mov_b32_e32 v14, s11
; GCN-O0-NEXT: v_mov_b32_e32 v13, s10
; GCN-O0-NEXT: v_mov_b32_e32 v12, s9
; GCN-O0-NEXT: v_mov_b32_e32 v11, s8
; GCN-O0-NEXT: v_mov_b32_e32 v10, s7
; GCN-O0-NEXT: v_mov_b32_e32 v9, s6
; GCN-O0-NEXT: v_mov_b32_e32 v8, s5
; GCN-O0-NEXT: v_mov_b32_e32 v7, s4
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v22
; GCN-O0-NEXT: v_mov_b32_e32 v1, v21
; GCN-O0-NEXT: v_mov_b32_e32 v6, v20
; GCN-O0-NEXT: v_mov_b32_e32 v2, v19
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 48
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v18
; GCN-O0-NEXT: v_mov_b32_e32 v1, v17
; GCN-O0-NEXT: v_mov_b32_e32 v6, v16
; GCN-O0-NEXT: v_mov_b32_e32 v2, v15
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 32
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v14
; GCN-O0-NEXT: v_mov_b32_e32 v1, v13
; GCN-O0-NEXT: v_mov_b32_e32 v6, v12
; GCN-O0-NEXT: v_mov_b32_e32 v2, v11
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 16
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v10
; GCN-O0-NEXT: v_mov_b32_e32 v1, v9
; GCN-O0-NEXT: v_mov_b32_e32 v6, v8
; GCN-O0-NEXT: v_mov_b32_e32 v2, v7
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
store <16 x float> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement on <32 x float>: writes 1.0 into lane %sel of the
; %vec kernel argument and stores the result to %out.
; Code expected by the assertions below:
;  * default optimization level: the vector arrives through two
;    s_load_dwordx16 (kernarg offsets 0xa4 and 0xe4), is copied into v0..v31,
;    %sel goes into m0, and v_movreld_b32 writes the inline constant 1.0. The
;    result is stored as eight flat_store_dwordx4, from %out + 0x70 down to
;    %out + 0 in 16-byte steps.
;  * -O0: both 16-dword halves are loaded into the same s[36:51] range one
;    after the other and copied out through intermediate SGPRs/VGPRs into
;    v7..v38; %sel is loaded last, moved to m0, and v_movreld_b32 writes 1.0
;    (held in v0) relative to v7. Eight stores follow, each with its address
;    rebuilt through s_add_u32 / s_addc_u32.
define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) {
; GCN-LABEL: float32_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s2, s[4:5], 0x124
; GCN-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: s_add_u32 s2, s0, 0x70
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_mov_b32_e32 v33, s3
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
; GCN-NEXT: v_mov_b32_e32 v4, s40
; GCN-NEXT: v_mov_b32_e32 v5, s41
; GCN-NEXT: v_mov_b32_e32 v6, s42
; GCN-NEXT: v_mov_b32_e32 v7, s43
; GCN-NEXT: v_mov_b32_e32 v8, s44
; GCN-NEXT: v_mov_b32_e32 v9, s45
; GCN-NEXT: v_mov_b32_e32 v10, s46
; GCN-NEXT: v_mov_b32_e32 v11, s47
; GCN-NEXT: v_mov_b32_e32 v12, s48
; GCN-NEXT: v_mov_b32_e32 v13, s49
; GCN-NEXT: v_mov_b32_e32 v14, s50
; GCN-NEXT: v_mov_b32_e32 v15, s51
; GCN-NEXT: v_mov_b32_e32 v16, s8
; GCN-NEXT: v_mov_b32_e32 v17, s9
; GCN-NEXT: v_mov_b32_e32 v18, s10
; GCN-NEXT: v_mov_b32_e32 v19, s11
; GCN-NEXT: v_mov_b32_e32 v20, s12
; GCN-NEXT: v_mov_b32_e32 v21, s13
; GCN-NEXT: v_mov_b32_e32 v22, s14
; GCN-NEXT: v_mov_b32_e32 v23, s15
; GCN-NEXT: v_mov_b32_e32 v24, s16
; GCN-NEXT: v_mov_b32_e32 v25, s17
; GCN-NEXT: v_mov_b32_e32 v26, s18
; GCN-NEXT: v_mov_b32_e32 v27, s19
; GCN-NEXT: v_mov_b32_e32 v28, s20
; GCN-NEXT: v_mov_b32_e32 v29, s21
; GCN-NEXT: v_mov_b32_e32 v30, s22
; GCN-NEXT: v_mov_b32_e32 v31, s23
; GCN-NEXT: v_mov_b32_e32 v32, s2
; GCN-NEXT: s_add_u32 s2, s0, 0x60
; GCN-NEXT: v_movreld_b32_e32 v0, 1.0
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v29, s3
; GCN-NEXT: v_mov_b32_e32 v28, s2
; GCN-NEXT: s_add_u32 s2, s0, 0x50
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v25, s3
; GCN-NEXT: v_mov_b32_e32 v24, s2
; GCN-NEXT: s_add_u32 s2, s0, 64
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v21, s3
; GCN-NEXT: v_mov_b32_e32 v20, s2
; GCN-NEXT: s_add_u32 s2, s0, 48
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v17, s3
; GCN-NEXT: v_mov_b32_e32 v16, s2
; GCN-NEXT: s_add_u32 s2, s0, 32
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v13, s3
; GCN-NEXT: v_mov_b32_e32 v12, s2
; GCN-NEXT: s_add_u32 s2, s0, 16
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v9, s3
; GCN-NEXT: v_mov_b32_e32 v8, s2
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: float32_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xe4
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s2, s51
; GCN-O0-NEXT: s_mov_b32 s3, s50
; GCN-O0-NEXT: s_mov_b32 s6, s49
; GCN-O0-NEXT: s_mov_b32 s7, s48
; GCN-O0-NEXT: s_mov_b32 s8, s47
; GCN-O0-NEXT: s_mov_b32 s9, s46
; GCN-O0-NEXT: s_mov_b32 s10, s45
; GCN-O0-NEXT: s_mov_b32 s11, s44
; GCN-O0-NEXT: s_mov_b32 s12, s43
; GCN-O0-NEXT: s_mov_b32 s13, s42
; GCN-O0-NEXT: s_mov_b32 s14, s41
; GCN-O0-NEXT: s_mov_b32 s15, s40
; GCN-O0-NEXT: s_mov_b32 s16, s39
; GCN-O0-NEXT: s_mov_b32 s17, s38
; GCN-O0-NEXT: s_mov_b32 s18, s37
; GCN-O0-NEXT: s_mov_b32 s19, s36
; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s20, s51
; GCN-O0-NEXT: s_mov_b32 s21, s50
; GCN-O0-NEXT: s_mov_b32 s22, s49
; GCN-O0-NEXT: s_mov_b32 s23, s48
; GCN-O0-NEXT: s_mov_b32 s24, s47
; GCN-O0-NEXT: s_mov_b32 s25, s46
; GCN-O0-NEXT: s_mov_b32 s26, s45
; GCN-O0-NEXT: s_mov_b32 s27, s44
; GCN-O0-NEXT: s_mov_b32 s28, s43
; GCN-O0-NEXT: s_mov_b32 s29, s42
; GCN-O0-NEXT: s_mov_b32 s30, s41
; GCN-O0-NEXT: s_mov_b32 s31, s40
; GCN-O0-NEXT: s_mov_b32 s33, s39
; GCN-O0-NEXT: s_mov_b32 s34, s38
; GCN-O0-NEXT: s_mov_b32 s35, s37
; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
; GCN-O0-NEXT: v_mov_b32_e32 v7, s36
; GCN-O0-NEXT: v_mov_b32_e32 v62, s35
; GCN-O0-NEXT: v_mov_b32_e32 v61, s34
; GCN-O0-NEXT: v_mov_b32_e32 v60, s33
; GCN-O0-NEXT: v_mov_b32_e32 v59, s31
; GCN-O0-NEXT: v_mov_b32_e32 v58, s30
; GCN-O0-NEXT: v_mov_b32_e32 v57, s29
; GCN-O0-NEXT: v_mov_b32_e32 v56, s28
; GCN-O0-NEXT: v_mov_b32_e32 v55, s27
; GCN-O0-NEXT: v_mov_b32_e32 v54, s26
; GCN-O0-NEXT: v_mov_b32_e32 v53, s25
; GCN-O0-NEXT: v_mov_b32_e32 v52, s24
; GCN-O0-NEXT: v_mov_b32_e32 v51, s23
; GCN-O0-NEXT: v_mov_b32_e32 v50, s22
; GCN-O0-NEXT: v_mov_b32_e32 v49, s21
; GCN-O0-NEXT: v_mov_b32_e32 v48, s20
; GCN-O0-NEXT: v_mov_b32_e32 v47, s19
; GCN-O0-NEXT: v_mov_b32_e32 v46, s18
; GCN-O0-NEXT: v_mov_b32_e32 v45, s17
; GCN-O0-NEXT: v_mov_b32_e32 v44, s16
; GCN-O0-NEXT: v_mov_b32_e32 v43, s15
; GCN-O0-NEXT: v_mov_b32_e32 v42, s14
; GCN-O0-NEXT: v_mov_b32_e32 v41, s13
; GCN-O0-NEXT: v_mov_b32_e32 v40, s12
; GCN-O0-NEXT: v_mov_b32_e32 v39, s11
; GCN-O0-NEXT: v_mov_b32_e32 v6, s10
; GCN-O0-NEXT: v_mov_b32_e32 v5, s9
; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v8, v62
; GCN-O0-NEXT: v_mov_b32_e32 v9, v61
; GCN-O0-NEXT: v_mov_b32_e32 v10, v60
; GCN-O0-NEXT: v_mov_b32_e32 v11, v59
; GCN-O0-NEXT: v_mov_b32_e32 v12, v58
; GCN-O0-NEXT: v_mov_b32_e32 v13, v57
; GCN-O0-NEXT: v_mov_b32_e32 v14, v56
; GCN-O0-NEXT: v_mov_b32_e32 v15, v55
; GCN-O0-NEXT: v_mov_b32_e32 v16, v54
; GCN-O0-NEXT: v_mov_b32_e32 v17, v53
; GCN-O0-NEXT: v_mov_b32_e32 v18, v52
; GCN-O0-NEXT: v_mov_b32_e32 v19, v51
; GCN-O0-NEXT: v_mov_b32_e32 v20, v50
; GCN-O0-NEXT: v_mov_b32_e32 v21, v49
; GCN-O0-NEXT: v_mov_b32_e32 v22, v48
; GCN-O0-NEXT: v_mov_b32_e32 v23, v47
; GCN-O0-NEXT: v_mov_b32_e32 v24, v46
; GCN-O0-NEXT: v_mov_b32_e32 v25, v45
; GCN-O0-NEXT: v_mov_b32_e32 v26, v44
; GCN-O0-NEXT: v_mov_b32_e32 v27, v43
; GCN-O0-NEXT: v_mov_b32_e32 v28, v42
; GCN-O0-NEXT: v_mov_b32_e32 v29, v41
; GCN-O0-NEXT: v_mov_b32_e32 v30, v40
; GCN-O0-NEXT: v_mov_b32_e32 v31, v39
; GCN-O0-NEXT: v_mov_b32_e32 v32, v6
; GCN-O0-NEXT: v_mov_b32_e32 v33, v5
; GCN-O0-NEXT: v_mov_b32_e32 v34, v4
; GCN-O0-NEXT: v_mov_b32_e32 v35, v3
; GCN-O0-NEXT: v_mov_b32_e32 v36, v2
; GCN-O0-NEXT: v_mov_b32_e32 v37, v1
; GCN-O0-NEXT: v_mov_b32_e32 v38, v0
; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x124
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v38
; GCN-O0-NEXT: v_mov_b32_e32 v1, v37
; GCN-O0-NEXT: v_mov_b32_e32 v6, v36
; GCN-O0-NEXT: v_mov_b32_e32 v2, v35
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v34
; GCN-O0-NEXT: v_mov_b32_e32 v1, v33
; GCN-O0-NEXT: v_mov_b32_e32 v6, v32
; GCN-O0-NEXT: v_mov_b32_e32 v2, v31
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x60
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v30
; GCN-O0-NEXT: v_mov_b32_e32 v1, v29
; GCN-O0-NEXT: v_mov_b32_e32 v6, v28
; GCN-O0-NEXT: v_mov_b32_e32 v2, v27
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v26
; GCN-O0-NEXT: v_mov_b32_e32 v1, v25
; GCN-O0-NEXT: v_mov_b32_e32 v6, v24
; GCN-O0-NEXT: v_mov_b32_e32 v2, v23
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 64
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v22
; GCN-O0-NEXT: v_mov_b32_e32 v1, v21
; GCN-O0-NEXT: v_mov_b32_e32 v6, v20
; GCN-O0-NEXT: v_mov_b32_e32 v2, v19
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 48
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v18
; GCN-O0-NEXT: v_mov_b32_e32 v1, v17
; GCN-O0-NEXT: v_mov_b32_e32 v6, v16
; GCN-O0-NEXT: v_mov_b32_e32 v2, v15
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 32
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v14
; GCN-O0-NEXT: v_mov_b32_e32 v1, v13
; GCN-O0-NEXT: v_mov_b32_e32 v6, v12
; GCN-O0-NEXT: v_mov_b32_e32 v2, v11
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 16
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v10
; GCN-O0-NEXT: v_mov_b32_e32 v1, v9
; GCN-O0-NEXT: v_mov_b32_e32 v6, v8
; GCN-O0-NEXT: v_mov_b32_e32 v2, v7
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
store <32 x float> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement on <4 x half>: writes half 1.0 into lane %sel of
; the %vec kernel argument and stores the 64-bit result to %out.
; Both runs expect the insert to be done with scalar bit operations on the
; packed 64-bit value rather than with per-lane selects or an indexed write:
;   splat  = 0x3c003c00 in both dwords   (0x3c00 is the half encoding of 1.0)
;   mask   = 0xffff << (%sel << 4)       (16 bits per lane)
;   result = ((vec ^ splat) & mask) ^ vec
; The -O0 run differs only in materializing the constants in registers first.
define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
; GCN-LABEL: half4_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
; GCN-NEXT: s_lshl_b32 s6, s6, 4
; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6
; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: half4_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5]
; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34
; GCN-O0-NEXT: s_mov_b32 s7, 0x3c003c00
; GCN-O0-NEXT: s_mov_b32 s0, s7
; GCN-O0-NEXT: s_mov_b32 s1, s7
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
; GCN-O0-NEXT: s_mov_b32 s7, 4
; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xffff
; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; GCN-O0-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: v_mov_b32_e32 v3, s1
; GCN-O0-NEXT: v_mov_b32_e32 v2, s0
; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
store <4 x half> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement on <2 x half>: writes half 1.0 into lane %sel of
; the %vec kernel argument and stores the 32-bit result to %out.
; Both runs expect the same scalar bit-blend as the four-element half case,
; done on a single dword:
;   mask   = 0xffff << (%sel << 4)       (16 bits per lane)
;   result = ((vec ^ 0x3c003c00) & mask) ^ vec
; where 0x3c00 is the half encoding of 1.0. The -O0 run differs only in
; materializing the constants in registers first.
define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) {
; GCN-LABEL: half2_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s3, s3, 4
; GCN-NEXT: s_xor_b32 s4, s2, 0x3c003c00
; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3
; GCN-NEXT: s_and_b32 s3, s4, s3
; GCN-NEXT: s_xor_b32 s2, s3, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: half2_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dword s1, s[4:5], 0x2c
; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x30
; GCN-O0-NEXT: s_mov_b32 s0, 0x3c003c00
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_xor_b32 s0, s1, s0
; GCN-O0-NEXT: s_mov_b32 s5, 4
; GCN-O0-NEXT: s_lshl_b32 s5, s4, s5
; GCN-O0-NEXT: s_mov_b32 s4, 0xffff
; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5
; GCN-O0-NEXT: s_and_b32 s0, s0, s4
; GCN-O0-NEXT: s_xor_b32 s0, s0, s1
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: v_mov_b32_e32 v2, s0
; GCN-O0-NEXT: flat_store_dword v[0:1], v2
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
store <2 x half> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement of half 1.0 into an <8 x half> kernel argument;
; the updated vector is stored to %out.
; Default-pipeline checks: %sel is compared (s_cmp_lg_u32) against each lane
; index from 7 down to 0, and every compare feeds a v_cndmask that keeps the
; original lane or substitutes 0x3c00; the 16-bit lanes are then repacked into
; four dwords for a single flat_store_dwordx4.
; -O0 checks: the four dwords of the vector are written to scratch, 0x3c00 is
; stored with buffer_store_short at byte offset (%sel & 7) << 1, and the four
; dwords are reloaded from scratch before the final store.
define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, i32 %sel) {
; GCN-LABEL: half8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dword s6, s[4:5], 0x44
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s7, s3, 16
; GCN-NEXT: s_cmp_lg_u32 s6, 7
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_lg_u32 s6, 6
; GCN-NEXT: v_cndmask_b32_sdwa v1, v0, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_lshr_b32 s3, s2, 16
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GCN-NEXT: s_cmp_lg_u32 s6, 5
; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_lg_u32 s6, 4
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_lshr_b32 s2, s1, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GCN-NEXT: s_cmp_lg_u32 s6, 3
; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_lg_u32 s6, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_lshr_b32 s1, s0, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
; GCN-NEXT: s_cmp_lg_u32 s6, 1
; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: half8_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-O0-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x44
; GCN-O0-NEXT: s_mov_b32 s3, 7
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_and_b32 s2, s2, s3
; GCN-O0-NEXT: s_mov_b32 s3, 1
; GCN-O0-NEXT: s_lshl_b32 s3, s2, s3
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_or_b32 s2, s2, s3
; GCN-O0-NEXT: s_mov_b32 s3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:12
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0
; GCN-O0-NEXT: v_mov_b32_e32 v0, 0x3c00
; GCN-O0-NEXT: v_mov_b32_e32 v1, s2
; GCN-O0-NEXT: buffer_store_short v0, v1, s[12:15], 0 offen
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
; %sel is a kernel argument, so the destination lane is only known at run time.
%v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
store <8 x half> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement of i16 1 into a <2 x i16> kernel argument; the
; updated vector is stored to %out.
; Both RUN configurations are checked to perform the insert as a masked blend
; on a single 32-bit scalar: the lane mask is 0xffff shifted left by %sel * 16
; (s_lshl_b32 by 4), and the result is ((vec ^ 0x10001) & mask) ^ vec, where
; 0x10001 carries the inserted value 1 in both 16-bit lanes.
define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) {
; GCN-LABEL: short2_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s3, s3, 4
; GCN-NEXT: s_xor_b32 s4, s2, 0x10001
; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3
; GCN-NEXT: s_and_b32 s3, s4, s3
; GCN-NEXT: s_xor_b32 s2, s3, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: short2_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dword s1, s[4:5], 0x2c
; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x30
; GCN-O0-NEXT: s_mov_b32 s0, 0x10001
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_xor_b32 s0, s1, s0
; GCN-O0-NEXT: s_mov_b32 s5, 4
; GCN-O0-NEXT: s_lshl_b32 s5, s4, s5
; GCN-O0-NEXT: s_mov_b32 s4, 0xffff
; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5
; GCN-O0-NEXT: s_and_b32 s0, s0, s4
; GCN-O0-NEXT: s_xor_b32 s0, s0, s1
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: v_mov_b32_e32 v2, s0
; GCN-O0-NEXT: flat_store_dword v[0:1], v2
; GCN-O0-NEXT: s_endpgm
entry:
; %sel is a kernel argument, so the destination lane is only known at run time.
%v = insertelement <2 x i16> %vec, i16 1, i32 %sel
store <2 x i16> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement of i16 1 into a <4 x i16> kernel argument; the
; updated vector is stored to %out.
; Both RUN configurations are checked to perform the insert as a masked blend
; on a 64-bit scalar pair: the lane mask is 0xffff shifted left by %sel * 16
; using s_lshl_b64, and the result is ((vec ^ splat) & mask) ^ vec, where the
; splat places 0x10001 in both 32-bit halves.
define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
; GCN-LABEL: short4_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s4, 0x10001
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
; GCN-NEXT: s_lshl_b32 s6, s6, 4
; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6
; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: short4_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5]
; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34
; GCN-O0-NEXT: s_mov_b32 s7, 0x10001
; GCN-O0-NEXT: s_mov_b32 s0, s7
; GCN-O0-NEXT: s_mov_b32 s1, s7
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
; GCN-O0-NEXT: s_mov_b32 s7, 4
; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xffff
; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; GCN-O0-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: v_mov_b32_e32 v3, s1
; GCN-O0-NEXT: v_mov_b32_e32 v2, s0
; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-O0-NEXT: s_endpgm
entry:
; %sel is a kernel argument, so the destination lane is only known at run time.
%v = insertelement <4 x i16> %vec, i16 1, i32 %sel
store <4 x i16> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement of i8 1 into an <8 x i8> kernel argument; the
; updated vector is stored to %out.
; Both RUN configurations are checked to perform the insert as a masked blend
; on a 64-bit scalar pair: the lane mask is 0xff shifted left by %sel * 8
; (s_lshl_b32 by 3, then s_lshl_b64), and the result is
; ((vec ^ splat) & mask) ^ vec with 0x1010101 in both 32-bit halves.
; The default pipeline then emits one flat_store_dwordx2, whereas the -O0
; checks extract every byte with scalar right shifts and write the result with
; eight flat_store_byte instructions at offsets 7, 6, 5, 4, 3, 2, 0 and 1.
define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
; GCN-LABEL: byte8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_xor_b32 s5, s3, 0x1010101
; GCN-NEXT: s_lshl_b32 s6, s6, 3
; GCN-NEXT: s_xor_b32 s4, s2, 0x1010101
; GCN-NEXT: s_lshl_b64 s[6:7], 0xff, s6
; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: byte8_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b64 s[0:1], s[4:5]
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GCN-O0-NEXT: s_load_dword s6, s[0:1], 0x34
; GCN-O0-NEXT: s_mov_b32 s7, 0x1010101
; GCN-O0-NEXT: s_mov_b32 s0, s7
; GCN-O0-NEXT: s_mov_b32 s1, s7
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT: s_mov_b32 s7, 3
; GCN-O0-NEXT: s_lshl_b32 s8, s6, s7
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0xff
; GCN-O0-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; GCN-O0-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3]
; GCN-O0-NEXT: s_mov_b32 s3, s10
; GCN-O0-NEXT: s_mov_b32 s0, 8
; GCN-O0-NEXT: s_lshr_b32 s0, s3, s0
; GCN-O0-NEXT: s_mov_b32 s1, s10
; GCN-O0-NEXT: s_mov_b32 s2, 16
; GCN-O0-NEXT: s_lshr_b32 s2, s3, s2
; GCN-O0-NEXT: s_mov_b32 s6, 24
; GCN-O0-NEXT: s_lshr_b32 s3, s3, s6
; GCN-O0-NEXT: s_mov_b32 s6, 32
; GCN-O0-NEXT: s_lshr_b64 s[6:7], s[10:11], s6
; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b32 s7, 40
; GCN-O0-NEXT: s_lshr_b64 s[8:9], s[10:11], s7
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s8, 48
; GCN-O0-NEXT: s_lshr_b64 s[8:9], s[10:11], s8
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, 56
; GCN-O0-NEXT: s_lshr_b64 s[10:11], s[10:11], s9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: s_mov_b64 s[14:15], 7
; GCN-O0-NEXT: s_mov_b32 s10, s4
; GCN-O0-NEXT: s_mov_b32 s11, s5
; GCN-O0-NEXT: s_mov_b32 s13, s14
; GCN-O0-NEXT: s_mov_b32 s12, s15
; GCN-O0-NEXT: s_add_u32 s10, s10, s13
; GCN-O0-NEXT: s_addc_u32 s12, s11, s12
; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: v_mov_b32_e32 v0, s10
; GCN-O0-NEXT: v_mov_b32_e32 v1, s11
; GCN-O0-NEXT: v_mov_b32_e32 v2, s9
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: s_mov_b64 s[14:15], 6
; GCN-O0-NEXT: s_mov_b32 s10, s4
; GCN-O0-NEXT: s_mov_b32 s9, s5
; GCN-O0-NEXT: s_mov_b32 s12, s14
; GCN-O0-NEXT: s_mov_b32 s11, s15
; GCN-O0-NEXT: s_add_u32 s10, s10, s12
; GCN-O0-NEXT: s_addc_u32 s9, s9, s11
; GCN-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
; GCN-O0-NEXT: s_mov_b32 s11, s9
; GCN-O0-NEXT: v_mov_b32_e32 v0, s10
; GCN-O0-NEXT: v_mov_b32_e32 v1, s11
; GCN-O0-NEXT: v_mov_b32_e32 v2, s8
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: s_mov_b64 s[12:13], 5
; GCN-O0-NEXT: s_mov_b32 s8, s4
; GCN-O0-NEXT: s_mov_b32 s9, s5
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s8
; GCN-O0-NEXT: v_mov_b32_e32 v1, s9
; GCN-O0-NEXT: v_mov_b32_e32 v2, s7
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: s_mov_b64 s[12:13], 4
; GCN-O0-NEXT: s_mov_b32 s8, s4
; GCN-O0-NEXT: s_mov_b32 s7, s5
; GCN-O0-NEXT: s_mov_b32 s10, s12
; GCN-O0-NEXT: s_mov_b32 s9, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s10
; GCN-O0-NEXT: s_addc_u32 s7, s7, s9
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s7
; GCN-O0-NEXT: v_mov_b32_e32 v0, s8
; GCN-O0-NEXT: v_mov_b32_e32 v1, s9
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: s_mov_b64 s[10:11], 3
; GCN-O0-NEXT: s_mov_b32 s6, s4
; GCN-O0-NEXT: s_mov_b32 s7, s5
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: s_mov_b32 s8, s11
; GCN-O0-NEXT: s_add_u32 s6, s6, s9
; GCN-O0-NEXT: s_addc_u32 s8, s7, s8
; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: v_mov_b32_e32 v0, s6
; GCN-O0-NEXT: v_mov_b32_e32 v1, s7
; GCN-O0-NEXT: v_mov_b32_e32 v2, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: s_mov_b64 s[10:11], 2
; GCN-O0-NEXT: s_mov_b32 s6, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s8, s10
; GCN-O0-NEXT: s_mov_b32 s7, s11
; GCN-O0-NEXT: s_add_u32 s6, s6, s8
; GCN-O0-NEXT: s_addc_u32 s3, s3, s7
; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b32 s7, s3
; GCN-O0-NEXT: v_mov_b32_e32 v0, s6
; GCN-O0-NEXT: v_mov_b32_e32 v1, s7
; GCN-O0-NEXT: v_mov_b32_e32 v2, s2
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v2, s1
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: s_mov_b64 s[6:7], 1
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s4, s6
; GCN-O0-NEXT: s_mov_b32 s3, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s4
; GCN-O0-NEXT: s_addc_u32 s1, s1, s3
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: v_mov_b32_e32 v2, s0
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: s_endpgm
entry:
; %sel is a kernel argument, so the destination lane is only known at run time.
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
store <8 x i8> %v, ptr addrspace(1) %out
ret void
}
; Dynamic-index insertelement of i8 1 into a <16 x i8> kernel argument; the
; updated vector is stored to %out.
; Default-pipeline checks: the vector is loaded as four dwords; for each lane
; index from 15 down to 0, s_cmp_lg_u32 on %sel feeds an s_cselect_b32 that
; keeps the extracted byte or substitutes 1. The bytes are then recombined with
; shift/and/or into four dwords and written with one flat_store_dwordx4.
; -O0 checks: each of the 16 vector bytes is loaded separately with
; flat_load_ubyte from kernarg offsets 52 through 0x43 and spilled to scratch
; at offsets 0..15; a buffer_store_byte of 1 at offset (%sel & 15) overwrites
; the selected lane; all 16 bytes are reloaded and stored to %out one at a time
; with flat_store_byte, from offset 15 down to 0.
define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, i32 %sel) {
; GCN-LABEL: byte16_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dword s6, s[4:5], 0x44
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s7, s3, 24
; GCN-NEXT: s_cmp_lg_u32 s6, 15
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_lshr_b32 s8, s3, 16
; GCN-NEXT: s_lshl_b32 s7, s7, 8
; GCN-NEXT: s_cmp_lg_u32 s6, 14
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_and_b32 s8, s8, 0xff
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_lshr_b32 s9, s3, 8
; GCN-NEXT: s_lshl_b32 s7, s7, 16
; GCN-NEXT: s_cmp_lg_u32 s6, 13
; GCN-NEXT: s_cselect_b32 s8, s9, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 8
; GCN-NEXT: s_cmp_lg_u32 s6, 12
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_and_b32 s3, s3, 0xff
; GCN-NEXT: s_or_b32 s3, s3, s8
; GCN-NEXT: s_and_b32 s3, s3, 0xffff
; GCN-NEXT: s_or_b32 s3, s3, s7
; GCN-NEXT: s_lshr_b32 s7, s2, 24
; GCN-NEXT: s_cmp_lg_u32 s6, 11
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 8
; GCN-NEXT: s_lshr_b32 s8, s2, 16
; GCN-NEXT: s_cmp_lg_u32 s6, 10
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_and_b32 s8, s8, 0xff
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_lshl_b32 s7, s7, 16
; GCN-NEXT: s_lshr_b32 s8, s2, 8
; GCN-NEXT: s_cmp_lg_u32 s6, 9
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 8
; GCN-NEXT: s_cmp_lg_u32 s6, 8
; GCN-NEXT: s_cselect_b32 s2, s2, 1
; GCN-NEXT: s_and_b32 s2, s2, 0xff
; GCN-NEXT: s_or_b32 s2, s2, s8
; GCN-NEXT: s_and_b32 s2, s2, 0xffff
; GCN-NEXT: s_or_b32 s2, s2, s7
; GCN-NEXT: s_lshr_b32 s7, s1, 24
; GCN-NEXT: s_cmp_lg_u32 s6, 7
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 8
; GCN-NEXT: s_lshr_b32 s8, s1, 16
; GCN-NEXT: s_cmp_lg_u32 s6, 6
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_and_b32 s8, s8, 0xff
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_lshl_b32 s7, s7, 16
; GCN-NEXT: s_lshr_b32 s8, s1, 8
; GCN-NEXT: s_cmp_lg_u32 s6, 5
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 8
; GCN-NEXT: s_cmp_lg_u32 s6, 4
; GCN-NEXT: s_cselect_b32 s1, s1, 1
; GCN-NEXT: s_and_b32 s1, s1, 0xff
; GCN-NEXT: s_or_b32 s1, s1, s8
; GCN-NEXT: s_and_b32 s1, s1, 0xffff
; GCN-NEXT: s_or_b32 s1, s1, s7
; GCN-NEXT: s_lshr_b32 s7, s0, 24
; GCN-NEXT: s_cmp_lg_u32 s6, 3
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 8
; GCN-NEXT: s_lshr_b32 s8, s0, 16
; GCN-NEXT: s_cmp_lg_u32 s6, 2
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_and_b32 s8, s8, 0xff
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_lshl_b32 s7, s7, 16
; GCN-NEXT: s_lshr_b32 s8, s0, 8
; GCN-NEXT: s_cmp_lg_u32 s6, 1
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 8
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s0, s0, 1
; GCN-NEXT: s_and_b32 s0, s0, 0xff
; GCN-NEXT: s_or_b32 s0, s0, s8
; GCN-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NEXT: s_or_b32 s0, s0, s7
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: byte16_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 52
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 53
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v2, s1
; GCN-O0-NEXT: v_mov_b32_e32 v1, s0
; GCN-O0-NEXT: flat_load_ubyte v1, v[1:2]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 54
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v3, s1
; GCN-O0-NEXT: v_mov_b32_e32 v2, s0
; GCN-O0-NEXT: flat_load_ubyte v2, v[2:3]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 55
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v4, s1
; GCN-O0-NEXT: v_mov_b32_e32 v3, s0
; GCN-O0-NEXT: flat_load_ubyte v3, v[3:4]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 56
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v5, s1
; GCN-O0-NEXT: v_mov_b32_e32 v4, s0
; GCN-O0-NEXT: flat_load_ubyte v4, v[4:5]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 57
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v6, s1
; GCN-O0-NEXT: v_mov_b32_e32 v5, s0
; GCN-O0-NEXT: flat_load_ubyte v5, v[5:6]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 58
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v7, s1
; GCN-O0-NEXT: v_mov_b32_e32 v6, s0
; GCN-O0-NEXT: flat_load_ubyte v6, v[6:7]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 59
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v8, s1
; GCN-O0-NEXT: v_mov_b32_e32 v7, s0
; GCN-O0-NEXT: flat_load_ubyte v7, v[7:8]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 60
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v9, s1
; GCN-O0-NEXT: v_mov_b32_e32 v8, s0
; GCN-O0-NEXT: flat_load_ubyte v8, v[8:9]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 61
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v10, s1
; GCN-O0-NEXT: v_mov_b32_e32 v9, s0
; GCN-O0-NEXT: flat_load_ubyte v9, v[9:10]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 62
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v11, s1
; GCN-O0-NEXT: v_mov_b32_e32 v10, s0
; GCN-O0-NEXT: flat_load_ubyte v10, v[10:11]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 63
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v12, s1
; GCN-O0-NEXT: v_mov_b32_e32 v11, s0
; GCN-O0-NEXT: flat_load_ubyte v11, v[11:12]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 64
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v13, s1
; GCN-O0-NEXT: v_mov_b32_e32 v12, s0
; GCN-O0-NEXT: flat_load_ubyte v12, v[12:13]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x41
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v14, s1
; GCN-O0-NEXT: v_mov_b32_e32 v13, s0
; GCN-O0-NEXT: flat_load_ubyte v13, v[13:14]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x42
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v15, s1
; GCN-O0-NEXT: v_mov_b32_e32 v14, s0
; GCN-O0-NEXT: flat_load_ubyte v14, v[14:15]
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x43
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v16, s1
; GCN-O0-NEXT: v_mov_b32_e32 v15, s0
; GCN-O0-NEXT: flat_load_ubyte v15, v[15:16]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44
; GCN-O0-NEXT: s_mov_b32 s3, 15
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_and_b32 s3, s2, s3
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_or_b32 s2, s2, s3
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v15, off, s[12:15], 0 offset:15
; GCN-O0-NEXT: buffer_store_byte v14, off, s[12:15], 0 offset:14
; GCN-O0-NEXT: buffer_store_byte v13, off, s[12:15], 0 offset:13
; GCN-O0-NEXT: buffer_store_byte v12, off, s[12:15], 0 offset:12
; GCN-O0-NEXT: buffer_store_byte v11, off, s[12:15], 0 offset:11
; GCN-O0-NEXT: buffer_store_byte v10, off, s[12:15], 0 offset:10
; GCN-O0-NEXT: buffer_store_byte v9, off, s[12:15], 0 offset:9
; GCN-O0-NEXT: buffer_store_byte v8, off, s[12:15], 0 offset:8
; GCN-O0-NEXT: buffer_store_byte v7, off, s[12:15], 0 offset:7
; GCN-O0-NEXT: buffer_store_byte v6, off, s[12:15], 0 offset:6
; GCN-O0-NEXT: buffer_store_byte v5, off, s[12:15], 0 offset:5
; GCN-O0-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:4
; GCN-O0-NEXT: buffer_store_byte v3, off, s[12:15], 0 offset:3
; GCN-O0-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:2
; GCN-O0-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:1
; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
; GCN-O0-NEXT: v_mov_b32_e32 v1, s2
; GCN-O0-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen
; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[12:15], 0
; GCN-O0-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:1
; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[12:15], 0 offset:2
; GCN-O0-NEXT: buffer_load_ubyte v5, off, s[12:15], 0 offset:3
; GCN-O0-NEXT: buffer_load_ubyte v6, off, s[12:15], 0 offset:4
; GCN-O0-NEXT: buffer_load_ubyte v7, off, s[12:15], 0 offset:5
; GCN-O0-NEXT: buffer_load_ubyte v8, off, s[12:15], 0 offset:6
; GCN-O0-NEXT: buffer_load_ubyte v9, off, s[12:15], 0 offset:7
; GCN-O0-NEXT: buffer_load_ubyte v10, off, s[12:15], 0 offset:8
; GCN-O0-NEXT: buffer_load_ubyte v11, off, s[12:15], 0 offset:9
; GCN-O0-NEXT: buffer_load_ubyte v12, off, s[12:15], 0 offset:10
; GCN-O0-NEXT: buffer_load_ubyte v13, off, s[12:15], 0 offset:11
; GCN-O0-NEXT: buffer_load_ubyte v14, off, s[12:15], 0 offset:12
; GCN-O0-NEXT: buffer_load_ubyte v15, off, s[12:15], 0 offset:13
; GCN-O0-NEXT: buffer_load_ubyte v16, off, s[12:15], 0 offset:14
; GCN-O0-NEXT: buffer_load_ubyte v17, off, s[12:15], 0 offset:15
; GCN-O0-NEXT: s_mov_b64 s[6:7], 15
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: flat_store_byte v[0:1], v17
; GCN-O0-NEXT: s_mov_b64 s[6:7], 14
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v16
; GCN-O0-NEXT: s_mov_b64 s[6:7], 13
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v15
; GCN-O0-NEXT: s_mov_b64 s[6:7], 12
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v14
; GCN-O0-NEXT: s_mov_b64 s[6:7], 11
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v13
; GCN-O0-NEXT: s_mov_b64 s[6:7], 10
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v12
; GCN-O0-NEXT: s_mov_b64 s[6:7], 9
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v11
; GCN-O0-NEXT: s_mov_b64 s[6:7], 8
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v10
; GCN-O0-NEXT: s_mov_b64 s[6:7], 7
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v9
; GCN-O0-NEXT: s_mov_b64 s[6:7], 6
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v8
; GCN-O0-NEXT: s_mov_b64 s[6:7], 5
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v7
; GCN-O0-NEXT: s_mov_b64 s[6:7], 4
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v6
; GCN-O0-NEXT: s_mov_b64 s[6:7], 3
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v5
; GCN-O0-NEXT: s_mov_b64 s[6:7], 2
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v4
; GCN-O0-NEXT: s_mov_b64 s[6:7], 1
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_byte v[0:1], v3
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: s_endpgm
entry:
; %sel is a kernel argument, so the destination lane is only known at run time.
%v = insertelement <16 x i8> %vec, i8 1, i32 %sel
store <16 x i8> %v, ptr addrspace(1) %out
ret void
}
; double2_inselt: insert the constant double 1.0 into a <2 x double> kernel
; argument at the run-time index %sel and store the resulting vector to %out.
;
; Expected code, default-optimization run: no indirect register access; each
; element is picked with a scalar compare of the index against 1 and 0
; (s_cmp_eq_u32 + s_cselect_b32), writing 0x3ff00000 / 0 (the high / low dword
; of double 1.0) into the matching element, then one flat_store_dwordx4.
; Expected code, -O0 run: the index dword is shifted left by one (two dwords
; per double) and copied to m0, and the two halves of 1.0 are written with a
; pair of v_movreld_b32 instructions before the single flat_store_dwordx4.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) {
; GCN-LABEL: double2_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s6, s[4:5], 0x44
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s6, 1
; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3
; GCN-NEXT: s_cselect_b32 s2, 0, s2
; GCN-NEXT: s_cmp_eq_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s1, 0x3ff00000, s1
; GCN-NEXT: s_cselect_b32 s0, 0, s0
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: double2_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44
; GCN-O0-NEXT: s_mov_b32 s3, 1
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3
; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v2, s8
; GCN-O0-NEXT: v_mov_b32_e32 v3, s9
; GCN-O0-NEXT: v_mov_b32_e32 v4, s10
; GCN-O0-NEXT: v_mov_b32_e32 v5, s11
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v3, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
store <2 x double> %v, ptr addrspace(1) %out
ret void
}
; double5_inselt: insert the constant double 1.0 into a <5 x double> kernel
; argument (an odd, non-power-of-two element count) at the run-time index %sel
; and store the resulting vector to %out.
;
; Expected code, default-optimization run: no indirect register access; the
; index is compared against each of the five element numbers (s_cmp_eq_u32
; with 4, 1, 0, 3, 2) and s_cselect_b32 pairs substitute 0x3ff00000 / 0 (the
; high / low dword of double 1.0). The 40-byte result is written as
; flat_store_dwordx4 at +16 and +0 plus flat_store_dwordx2 at +32.
; Expected code, -O0 run: the ten loaded dwords are placed in a 16-VGPR tuple
; (v1..v16, padded with implicit-def registers), the index dword is shifted
; left by one and copied to m0, and two v_movreld_b32 instructions write the
; halves of 1.0; the stores use the same +16 / +0 / +32 layout.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) {
; GCN-LABEL: double5_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s12, s[4:5], 0xa4
; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x84
; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s12, 4
; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9
; GCN-NEXT: s_cselect_b32 s8, 0, s8
; GCN-NEXT: s_cmp_eq_u32 s12, 1
; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3
; GCN-NEXT: s_cselect_b32 s2, 0, s2
; GCN-NEXT: s_cmp_eq_u32 s12, 0
; GCN-NEXT: s_cselect_b32 s13, 0x3ff00000, s1
; GCN-NEXT: s_cselect_b32 s14, 0, s0
; GCN-NEXT: s_cmp_eq_u32 s12, 3
; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7
; GCN-NEXT: s_cselect_b32 s1, 0, s6
; GCN-NEXT: s_cmp_eq_u32 s12, 2
; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5
; GCN-NEXT: s_cselect_b32 s4, 0, s4
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: s_add_u32 s0, s10, 16
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_addc_u32 s1, s11, 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: v_mov_b32_e32 v4, s10
; GCN-NEXT: s_add_u32 s0, s10, 32
; GCN-NEXT: v_mov_b32_e32 v0, s14
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v5, s11
; GCN-NEXT: s_addc_u32 s1, s11, 0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: double5_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x84
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s10, s1
; GCN-O0-NEXT: s_mov_b32 s11, s0
; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x64
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s12, s27
; GCN-O0-NEXT: s_mov_b32 s13, s26
; GCN-O0-NEXT: s_mov_b32 s14, s25
; GCN-O0-NEXT: s_mov_b32 s15, s24
; GCN-O0-NEXT: s_mov_b32 s16, s23
; GCN-O0-NEXT: s_mov_b32 s17, s22
; GCN-O0-NEXT: s_mov_b32 s18, s21
; GCN-O0-NEXT: s_mov_b32 s19, s20
; GCN-O0-NEXT: ; implicit-def: $sgpr9
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: ; implicit-def: $sgpr8
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: ; implicit-def: $sgpr7
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: ; implicit-def: $sgpr6
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: ; implicit-def: $sgpr1
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: ; implicit-def: $sgpr20
; GCN-O0-NEXT: v_mov_b32_e32 v1, s19
; GCN-O0-NEXT: v_mov_b32_e32 v30, s18
; GCN-O0-NEXT: v_mov_b32_e32 v29, s17
; GCN-O0-NEXT: v_mov_b32_e32 v28, s16
; GCN-O0-NEXT: v_mov_b32_e32 v27, s15
; GCN-O0-NEXT: v_mov_b32_e32 v26, s14
; GCN-O0-NEXT: v_mov_b32_e32 v25, s13
; GCN-O0-NEXT: v_mov_b32_e32 v24, s12
; GCN-O0-NEXT: v_mov_b32_e32 v23, s11
; GCN-O0-NEXT: v_mov_b32_e32 v22, s10
; GCN-O0-NEXT: v_mov_b32_e32 v21, s9
; GCN-O0-NEXT: v_mov_b32_e32 v20, s8
; GCN-O0-NEXT: v_mov_b32_e32 v19, s7
; GCN-O0-NEXT: v_mov_b32_e32 v18, s6
; GCN-O0-NEXT: v_mov_b32_e32 v17, s1
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v2, v30
; GCN-O0-NEXT: v_mov_b32_e32 v3, v29
; GCN-O0-NEXT: v_mov_b32_e32 v4, v28
; GCN-O0-NEXT: v_mov_b32_e32 v5, v27
; GCN-O0-NEXT: v_mov_b32_e32 v6, v26
; GCN-O0-NEXT: v_mov_b32_e32 v7, v25
; GCN-O0-NEXT: v_mov_b32_e32 v8, v24
; GCN-O0-NEXT: v_mov_b32_e32 v9, v23
; GCN-O0-NEXT: v_mov_b32_e32 v10, v22
; GCN-O0-NEXT: v_mov_b32_e32 v11, v21
; GCN-O0-NEXT: v_mov_b32_e32 v12, v20
; GCN-O0-NEXT: v_mov_b32_e32 v13, v19
; GCN-O0-NEXT: v_mov_b32_e32 v14, v18
; GCN-O0-NEXT: v_mov_b32_e32 v15, v17
; GCN-O0-NEXT: v_mov_b32_e32 v16, v0
; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa4
; GCN-O0-NEXT: s_mov_b32 s1, 1
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1
; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s1
; GCN-O0-NEXT: s_mov_b32 m0, s0
; GCN-O0-NEXT: v_movreld_b32_e32 v1, v0
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v0, s1
; GCN-O0-NEXT: s_mov_b32 m0, s0
; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v4
; GCN-O0-NEXT: v_mov_b32_e32 v17, v3
; GCN-O0-NEXT: v_mov_b32_e32 v18, v2
; GCN-O0-NEXT: v_mov_b32_e32 v19, v1
; GCN-O0-NEXT: v_mov_b32_e32 v20, v8
; GCN-O0-NEXT: v_mov_b32_e32 v21, v7
; GCN-O0-NEXT: v_mov_b32_e32 v26, v6
; GCN-O0-NEXT: v_mov_b32_e32 v22, v5
; GCN-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23_vgpr24_vgpr25 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v23, v26
; GCN-O0-NEXT: v_mov_b32_e32 v24, v21
; GCN-O0-NEXT: v_mov_b32_e32 v25, v20
; GCN-O0-NEXT: s_mov_b64 s[6:7], 16
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s5
; GCN-O0-NEXT: s_addc_u32 s4, s1, s4
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v21, s1
; GCN-O0-NEXT: v_mov_b32_e32 v20, s0
; GCN-O0-NEXT: flat_store_dwordx4 v[20:21], v[22:25]
; GCN-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20_vgpr21_vgpr22 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v20, v18
; GCN-O0-NEXT: v_mov_b32_e32 v21, v17
; GCN-O0-NEXT: v_mov_b32_e32 v22, v0
; GCN-O0-NEXT: v_mov_b32_e32 v18, s3
; GCN-O0-NEXT: v_mov_b32_e32 v17, s2
; GCN-O0-NEXT: flat_store_dwordx4 v[17:18], v[19:22]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v10
; GCN-O0-NEXT: v_mov_b32_e32 v2, v9
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v0
; GCN-O0-NEXT: s_mov_b64 s[4:5], 32
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: s_mov_b32 s2, s5
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
store <5 x double> %v, ptr addrspace(1) %out
ret void
}
; double8_inselt: insert the constant double 1.0 into an <8 x double> kernel
; argument at the run-time index %sel and store the resulting vector to %out.
;
; Expected code, default-optimization run: the sixteen vector dwords are
; copied into v0..v15, m0 is set to the index shifted left by one
; (s_lshl_b32 m0, s2, 1), and two v_movreld_b32 instructions write 0 and
; 0x3ff00000 (the low / high dword of double 1.0) into the selected element.
; The 64-byte result is written with four flat_store_dwordx4 at +48, +32,
; +16 and +0.
; Expected code, -O0 run: same indirect-write scheme on v7..v22, with the
; shift and the m0 copy emitted as separate instructions and m0 reloaded
; before each v_movreld_b32; the four stores use the same offsets.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) {
; GCN-LABEL: double8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s2, s[4:5], 0xa4
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 m0, s2, 1
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_mov_b32_e32 v5, s13
; GCN-NEXT: v_mov_b32_e32 v6, s14
; GCN-NEXT: v_mov_b32_e32 v7, s15
; GCN-NEXT: v_mov_b32_e32 v8, s16
; GCN-NEXT: v_mov_b32_e32 v9, s17
; GCN-NEXT: v_mov_b32_e32 v10, s18
; GCN-NEXT: v_mov_b32_e32 v11, s19
; GCN-NEXT: v_mov_b32_e32 v12, s20
; GCN-NEXT: v_mov_b32_e32 v13, s21
; GCN-NEXT: v_mov_b32_e32 v14, s22
; GCN-NEXT: v_mov_b32_e32 v15, s23
; GCN-NEXT: s_add_u32 s2, s0, 48
; GCN-NEXT: v_movreld_b32_e32 v0, 0
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_movreld_b32_e32 v1, v16
; GCN-NEXT: v_mov_b32_e32 v17, s3
; GCN-NEXT: v_mov_b32_e32 v16, s2
; GCN-NEXT: s_add_u32 s2, s0, 32
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v13, s3
; GCN-NEXT: v_mov_b32_e32 v12, s2
; GCN-NEXT: s_add_u32 s2, s0, 16
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v9, s3
; GCN-NEXT: v_mov_b32_e32 v8, s2
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: double8_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0xa4
; GCN-O0-NEXT: s_mov_b32 s3, 1
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3
; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v7, s8
; GCN-O0-NEXT: v_mov_b32_e32 v8, s9
; GCN-O0-NEXT: v_mov_b32_e32 v9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v10, s11
; GCN-O0-NEXT: v_mov_b32_e32 v11, s12
; GCN-O0-NEXT: v_mov_b32_e32 v12, s13
; GCN-O0-NEXT: v_mov_b32_e32 v13, s14
; GCN-O0-NEXT: v_mov_b32_e32 v14, s15
; GCN-O0-NEXT: v_mov_b32_e32 v15, s16
; GCN-O0-NEXT: v_mov_b32_e32 v16, s17
; GCN-O0-NEXT: v_mov_b32_e32 v17, s18
; GCN-O0-NEXT: v_mov_b32_e32 v18, s19
; GCN-O0-NEXT: v_mov_b32_e32 v19, s20
; GCN-O0-NEXT: v_mov_b32_e32 v20, s21
; GCN-O0-NEXT: v_mov_b32_e32 v21, s22
; GCN-O0-NEXT: v_mov_b32_e32 v22, s23
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v22
; GCN-O0-NEXT: v_mov_b32_e32 v1, v21
; GCN-O0-NEXT: v_mov_b32_e32 v6, v20
; GCN-O0-NEXT: v_mov_b32_e32 v2, v19
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 48
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v18
; GCN-O0-NEXT: v_mov_b32_e32 v1, v17
; GCN-O0-NEXT: v_mov_b32_e32 v6, v16
; GCN-O0-NEXT: v_mov_b32_e32 v2, v15
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 32
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v14
; GCN-O0-NEXT: v_mov_b32_e32 v1, v13
; GCN-O0-NEXT: v_mov_b32_e32 v6, v12
; GCN-O0-NEXT: v_mov_b32_e32 v2, v11
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 16
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v10
; GCN-O0-NEXT: v_mov_b32_e32 v1, v9
; GCN-O0-NEXT: v_mov_b32_e32 v6, v8
; GCN-O0-NEXT: v_mov_b32_e32 v2, v7
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
store <8 x double> %v, ptr addrspace(1) %out
ret void
}
; double7_inselt: insert the constant double 1.0 into a <7 x double> kernel
; argument (an odd, non-power-of-two element count) at the run-time index %sel
; and store the resulting vector to %out.
;
; Expected code, default-optimization run: the fourteen vector dwords are
; copied into v0..v13, m0 is set to the index shifted left by one
; (s_lshl_b32 m0, s0, 1), and two v_movreld_b32 instructions write 0 and
; 0x3ff00000 (the low / high dword of double 1.0). The 56-byte result is
; written as flat_store_dwordx4 at +16, +0 and +32 plus flat_store_dwordx2
; at +48.
; Expected code, -O0 run: the dwords are placed in a 16-VGPR tuple (v7..v22,
; padded with implicit-def registers), the shifted index is copied to m0
; before each of the two v_movreld_b32 writes, and the stores use the same
; +16 / +0 / +48 / +32 layout.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %vec, i32 %sel) {
; GCN-LABEL: double7_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x94
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x84
; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: s_load_dword s0, s[4:5], 0xa4
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_mov_b32_e32 v5, s13
; GCN-NEXT: v_mov_b32_e32 v6, s14
; GCN-NEXT: v_mov_b32_e32 v7, s15
; GCN-NEXT: v_mov_b32_e32 v9, s1
; GCN-NEXT: v_mov_b32_e32 v10, s2
; GCN-NEXT: v_mov_b32_e32 v11, s3
; GCN-NEXT: v_mov_b32_e32 v12, s16
; GCN-NEXT: v_mov_b32_e32 v13, s17
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 m0, s0, 1
; GCN-NEXT: v_movreld_b32_e32 v0, 0
; GCN-NEXT: s_add_u32 s0, s6, 16
; GCN-NEXT: v_movreld_b32_e32 v1, v16
; GCN-NEXT: s_addc_u32 s1, s7, 0
; GCN-NEXT: v_mov_b32_e32 v15, s1
; GCN-NEXT: v_mov_b32_e32 v14, s0
; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
; GCN-NEXT: s_add_u32 s0, s6, 48
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s7
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_addc_u32 s1, s7, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_add_u32 s0, s6, 32
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13]
; GCN-NEXT: s_addc_u32 s1, s7, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: double7_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x94
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s6, s1
; GCN-O0-NEXT: s_mov_b32 s7, s0
; GCN-O0-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x84
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s8, s15
; GCN-O0-NEXT: s_mov_b32 s9, s14
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x64
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s12, s27
; GCN-O0-NEXT: s_mov_b32 s13, s26
; GCN-O0-NEXT: s_mov_b32 s14, s25
; GCN-O0-NEXT: s_mov_b32 s15, s24
; GCN-O0-NEXT: s_mov_b32 s16, s23
; GCN-O0-NEXT: s_mov_b32 s17, s22
; GCN-O0-NEXT: s_mov_b32 s18, s21
; GCN-O0-NEXT: s_mov_b32 s19, s20
; GCN-O0-NEXT: ; implicit-def: $sgpr1
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: ; implicit-def: $sgpr20
; GCN-O0-NEXT: v_mov_b32_e32 v7, s19
; GCN-O0-NEXT: v_mov_b32_e32 v30, s18
; GCN-O0-NEXT: v_mov_b32_e32 v29, s17
; GCN-O0-NEXT: v_mov_b32_e32 v28, s16
; GCN-O0-NEXT: v_mov_b32_e32 v27, s15
; GCN-O0-NEXT: v_mov_b32_e32 v26, s14
; GCN-O0-NEXT: v_mov_b32_e32 v25, s13
; GCN-O0-NEXT: v_mov_b32_e32 v24, s12
; GCN-O0-NEXT: v_mov_b32_e32 v23, s11
; GCN-O0-NEXT: v_mov_b32_e32 v6, s10
; GCN-O0-NEXT: v_mov_b32_e32 v5, s9
; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v8, v30
; GCN-O0-NEXT: v_mov_b32_e32 v9, v29
; GCN-O0-NEXT: v_mov_b32_e32 v10, v28
; GCN-O0-NEXT: v_mov_b32_e32 v11, v27
; GCN-O0-NEXT: v_mov_b32_e32 v12, v26
; GCN-O0-NEXT: v_mov_b32_e32 v13, v25
; GCN-O0-NEXT: v_mov_b32_e32 v14, v24
; GCN-O0-NEXT: v_mov_b32_e32 v15, v23
; GCN-O0-NEXT: v_mov_b32_e32 v16, v6
; GCN-O0-NEXT: v_mov_b32_e32 v17, v5
; GCN-O0-NEXT: v_mov_b32_e32 v18, v4
; GCN-O0-NEXT: v_mov_b32_e32 v19, v3
; GCN-O0-NEXT: v_mov_b32_e32 v20, v2
; GCN-O0-NEXT: v_mov_b32_e32 v21, v1
; GCN-O0-NEXT: v_mov_b32_e32 v22, v0
; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0xa4
; GCN-O0-NEXT: s_mov_b32 s1, 1
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1
; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s1
; GCN-O0-NEXT: s_mov_b32 m0, s0
; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v0, s1
; GCN-O0-NEXT: s_mov_b32 m0, s0
; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v18
; GCN-O0-NEXT: v_mov_b32_e32 v1, v17
; GCN-O0-NEXT: v_mov_b32_e32 v6, v16
; GCN-O0-NEXT: v_mov_b32_e32 v2, v15
; GCN-O0-NEXT: v_mov_b32_e32 v3, v10
; GCN-O0-NEXT: v_mov_b32_e32 v4, v9
; GCN-O0-NEXT: v_mov_b32_e32 v5, v8
; GCN-O0-NEXT: v_mov_b32_e32 v23, v7
; GCN-O0-NEXT: v_mov_b32_e32 v24, v14
; GCN-O0-NEXT: v_mov_b32_e32 v25, v13
; GCN-O0-NEXT: v_mov_b32_e32 v30, v12
; GCN-O0-NEXT: v_mov_b32_e32 v26, v11
; GCN-O0-NEXT: ; kill: def $vgpr26 killed $vgpr26 def $vgpr26_vgpr27_vgpr28_vgpr29 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v27, v30
; GCN-O0-NEXT: v_mov_b32_e32 v28, v25
; GCN-O0-NEXT: v_mov_b32_e32 v29, v24
; GCN-O0-NEXT: s_mov_b64 s[6:7], 16
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s5
; GCN-O0-NEXT: s_addc_u32 s4, s1, s4
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v25, s1
; GCN-O0-NEXT: v_mov_b32_e32 v24, s0
; GCN-O0-NEXT: flat_store_dwordx4 v[24:25], v[26:29]
; GCN-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24_vgpr25_vgpr26 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v24, v5
; GCN-O0-NEXT: v_mov_b32_e32 v25, v4
; GCN-O0-NEXT: v_mov_b32_e32 v26, v3
; GCN-O0-NEXT: v_mov_b32_e32 v4, s3
; GCN-O0-NEXT: v_mov_b32_e32 v3, s2
; GCN-O0-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; GCN-O0-NEXT: v_mov_b32_e32 v3, v20
; GCN-O0-NEXT: v_mov_b32_e32 v7, v19
; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v8, v3
; GCN-O0-NEXT: s_mov_b64 s[6:7], 48
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s5
; GCN-O0-NEXT: s_addc_u32 s4, s1, s4
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v4, s1
; GCN-O0-NEXT: v_mov_b32_e32 v3, s0
; GCN-O0-NEXT: flat_store_dwordx2 v[3:4], v[7:8]
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[4:5], 32
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: s_mov_b32 s2, s5
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
store <7 x double> %v, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) {
; GCN-LABEL: double16_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x124
; GCN-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: s_lshl_b32 m0, s0, 1
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
; GCN-NEXT: v_mov_b32_e32 v4, s40
; GCN-NEXT: v_mov_b32_e32 v5, s41
; GCN-NEXT: v_mov_b32_e32 v6, s42
; GCN-NEXT: v_mov_b32_e32 v7, s43
; GCN-NEXT: v_mov_b32_e32 v8, s44
; GCN-NEXT: v_mov_b32_e32 v9, s45
; GCN-NEXT: v_mov_b32_e32 v10, s46
; GCN-NEXT: v_mov_b32_e32 v11, s47
; GCN-NEXT: v_mov_b32_e32 v12, s48
; GCN-NEXT: v_mov_b32_e32 v13, s49
; GCN-NEXT: v_mov_b32_e32 v14, s50
; GCN-NEXT: v_mov_b32_e32 v15, s51
; GCN-NEXT: v_mov_b32_e32 v16, s8
; GCN-NEXT: v_mov_b32_e32 v17, s9
; GCN-NEXT: v_mov_b32_e32 v18, s10
; GCN-NEXT: v_mov_b32_e32 v19, s11
; GCN-NEXT: v_mov_b32_e32 v20, s12
; GCN-NEXT: v_mov_b32_e32 v21, s13
; GCN-NEXT: v_mov_b32_e32 v22, s14
; GCN-NEXT: v_mov_b32_e32 v23, s15
; GCN-NEXT: v_mov_b32_e32 v24, s16
; GCN-NEXT: v_mov_b32_e32 v25, s17
; GCN-NEXT: v_mov_b32_e32 v26, s18
; GCN-NEXT: v_mov_b32_e32 v27, s19
; GCN-NEXT: v_mov_b32_e32 v28, s20
; GCN-NEXT: v_mov_b32_e32 v29, s21
; GCN-NEXT: v_mov_b32_e32 v30, s22
; GCN-NEXT: v_mov_b32_e32 v31, s23
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s2, s0, 0x70
; GCN-NEXT: v_movreld_b32_e32 v0, 0
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_movreld_b32_e32 v1, v32
; GCN-NEXT: v_mov_b32_e32 v33, s3
; GCN-NEXT: v_mov_b32_e32 v32, s2
; GCN-NEXT: s_add_u32 s2, s0, 0x60
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v29, s3
; GCN-NEXT: v_mov_b32_e32 v28, s2
; GCN-NEXT: s_add_u32 s2, s0, 0x50
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v25, s3
; GCN-NEXT: v_mov_b32_e32 v24, s2
; GCN-NEXT: s_add_u32 s2, s0, 64
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v21, s3
; GCN-NEXT: v_mov_b32_e32 v20, s2
; GCN-NEXT: s_add_u32 s2, s0, 48
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v17, s3
; GCN-NEXT: v_mov_b32_e32 v16, s2
; GCN-NEXT: s_add_u32 s2, s0, 32
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v13, s3
; GCN-NEXT: v_mov_b32_e32 v12, s2
; GCN-NEXT: s_add_u32 s2, s0, 16
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v9, s3
; GCN-NEXT: v_mov_b32_e32 v8, s2
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: double16_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xe4
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s2, s51
; GCN-O0-NEXT: s_mov_b32 s3, s50
; GCN-O0-NEXT: s_mov_b32 s6, s49
; GCN-O0-NEXT: s_mov_b32 s7, s48
; GCN-O0-NEXT: s_mov_b32 s8, s47
; GCN-O0-NEXT: s_mov_b32 s9, s46
; GCN-O0-NEXT: s_mov_b32 s10, s45
; GCN-O0-NEXT: s_mov_b32 s11, s44
; GCN-O0-NEXT: s_mov_b32 s12, s43
; GCN-O0-NEXT: s_mov_b32 s13, s42
; GCN-O0-NEXT: s_mov_b32 s14, s41
; GCN-O0-NEXT: s_mov_b32 s15, s40
; GCN-O0-NEXT: s_mov_b32 s16, s39
; GCN-O0-NEXT: s_mov_b32 s17, s38
; GCN-O0-NEXT: s_mov_b32 s18, s37
; GCN-O0-NEXT: s_mov_b32 s19, s36
; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s20, s51
; GCN-O0-NEXT: s_mov_b32 s21, s50
; GCN-O0-NEXT: s_mov_b32 s22, s49
; GCN-O0-NEXT: s_mov_b32 s23, s48
; GCN-O0-NEXT: s_mov_b32 s24, s47
; GCN-O0-NEXT: s_mov_b32 s25, s46
; GCN-O0-NEXT: s_mov_b32 s26, s45
; GCN-O0-NEXT: s_mov_b32 s27, s44
; GCN-O0-NEXT: s_mov_b32 s28, s43
; GCN-O0-NEXT: s_mov_b32 s29, s42
; GCN-O0-NEXT: s_mov_b32 s30, s41
; GCN-O0-NEXT: s_mov_b32 s31, s40
; GCN-O0-NEXT: s_mov_b32 s33, s39
; GCN-O0-NEXT: s_mov_b32 s34, s38
; GCN-O0-NEXT: s_mov_b32 s35, s37
; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
; GCN-O0-NEXT: v_mov_b32_e32 v7, s36
; GCN-O0-NEXT: v_mov_b32_e32 v62, s35
; GCN-O0-NEXT: v_mov_b32_e32 v61, s34
; GCN-O0-NEXT: v_mov_b32_e32 v60, s33
; GCN-O0-NEXT: v_mov_b32_e32 v59, s31
; GCN-O0-NEXT: v_mov_b32_e32 v58, s30
; GCN-O0-NEXT: v_mov_b32_e32 v57, s29
; GCN-O0-NEXT: v_mov_b32_e32 v56, s28
; GCN-O0-NEXT: v_mov_b32_e32 v55, s27
; GCN-O0-NEXT: v_mov_b32_e32 v54, s26
; GCN-O0-NEXT: v_mov_b32_e32 v53, s25
; GCN-O0-NEXT: v_mov_b32_e32 v52, s24
; GCN-O0-NEXT: v_mov_b32_e32 v51, s23
; GCN-O0-NEXT: v_mov_b32_e32 v50, s22
; GCN-O0-NEXT: v_mov_b32_e32 v49, s21
; GCN-O0-NEXT: v_mov_b32_e32 v48, s20
; GCN-O0-NEXT: v_mov_b32_e32 v47, s19
; GCN-O0-NEXT: v_mov_b32_e32 v46, s18
; GCN-O0-NEXT: v_mov_b32_e32 v45, s17
; GCN-O0-NEXT: v_mov_b32_e32 v44, s16
; GCN-O0-NEXT: v_mov_b32_e32 v43, s15
; GCN-O0-NEXT: v_mov_b32_e32 v42, s14
; GCN-O0-NEXT: v_mov_b32_e32 v41, s13
; GCN-O0-NEXT: v_mov_b32_e32 v40, s12
; GCN-O0-NEXT: v_mov_b32_e32 v39, s11
; GCN-O0-NEXT: v_mov_b32_e32 v6, s10
; GCN-O0-NEXT: v_mov_b32_e32 v5, s9
; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v8, v62
; GCN-O0-NEXT: v_mov_b32_e32 v9, v61
; GCN-O0-NEXT: v_mov_b32_e32 v10, v60
; GCN-O0-NEXT: v_mov_b32_e32 v11, v59
; GCN-O0-NEXT: v_mov_b32_e32 v12, v58
; GCN-O0-NEXT: v_mov_b32_e32 v13, v57
; GCN-O0-NEXT: v_mov_b32_e32 v14, v56
; GCN-O0-NEXT: v_mov_b32_e32 v15, v55
; GCN-O0-NEXT: v_mov_b32_e32 v16, v54
; GCN-O0-NEXT: v_mov_b32_e32 v17, v53
; GCN-O0-NEXT: v_mov_b32_e32 v18, v52
; GCN-O0-NEXT: v_mov_b32_e32 v19, v51
; GCN-O0-NEXT: v_mov_b32_e32 v20, v50
; GCN-O0-NEXT: v_mov_b32_e32 v21, v49
; GCN-O0-NEXT: v_mov_b32_e32 v22, v48
; GCN-O0-NEXT: v_mov_b32_e32 v23, v47
; GCN-O0-NEXT: v_mov_b32_e32 v24, v46
; GCN-O0-NEXT: v_mov_b32_e32 v25, v45
; GCN-O0-NEXT: v_mov_b32_e32 v26, v44
; GCN-O0-NEXT: v_mov_b32_e32 v27, v43
; GCN-O0-NEXT: v_mov_b32_e32 v28, v42
; GCN-O0-NEXT: v_mov_b32_e32 v29, v41
; GCN-O0-NEXT: v_mov_b32_e32 v30, v40
; GCN-O0-NEXT: v_mov_b32_e32 v31, v39
; GCN-O0-NEXT: v_mov_b32_e32 v32, v6
; GCN-O0-NEXT: v_mov_b32_e32 v33, v5
; GCN-O0-NEXT: v_mov_b32_e32 v34, v4
; GCN-O0-NEXT: v_mov_b32_e32 v35, v3
; GCN-O0-NEXT: v_mov_b32_e32 v36, v2
; GCN-O0-NEXT: v_mov_b32_e32 v37, v1
; GCN-O0-NEXT: v_mov_b32_e32 v38, v0
; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x124
; GCN-O0-NEXT: s_mov_b32 s3, 1
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_lshl_b32 s2, s2, s3
; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v38
; GCN-O0-NEXT: v_mov_b32_e32 v1, v37
; GCN-O0-NEXT: v_mov_b32_e32 v6, v36
; GCN-O0-NEXT: v_mov_b32_e32 v2, v35
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v34
; GCN-O0-NEXT: v_mov_b32_e32 v1, v33
; GCN-O0-NEXT: v_mov_b32_e32 v6, v32
; GCN-O0-NEXT: v_mov_b32_e32 v2, v31
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x60
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v30
; GCN-O0-NEXT: v_mov_b32_e32 v1, v29
; GCN-O0-NEXT: v_mov_b32_e32 v6, v28
; GCN-O0-NEXT: v_mov_b32_e32 v2, v27
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v26
; GCN-O0-NEXT: v_mov_b32_e32 v1, v25
; GCN-O0-NEXT: v_mov_b32_e32 v6, v24
; GCN-O0-NEXT: v_mov_b32_e32 v2, v23
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 64
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v22
; GCN-O0-NEXT: v_mov_b32_e32 v1, v21
; GCN-O0-NEXT: v_mov_b32_e32 v6, v20
; GCN-O0-NEXT: v_mov_b32_e32 v2, v19
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 48
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v18
; GCN-O0-NEXT: v_mov_b32_e32 v1, v17
; GCN-O0-NEXT: v_mov_b32_e32 v6, v16
; GCN-O0-NEXT: v_mov_b32_e32 v2, v15
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 32
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v14
; GCN-O0-NEXT: v_mov_b32_e32 v1, v13
; GCN-O0-NEXT: v_mov_b32_e32 v6, v12
; GCN-O0-NEXT: v_mov_b32_e32 v2, v11
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 16
; GCN-O0-NEXT: s_mov_b32 s2, s0
; GCN-O0-NEXT: s_mov_b32 s3, s1
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s2, s2, s5
; GCN-O0-NEXT: s_addc_u32 s4, s3, s4
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: v_mov_b32_e32 v0, v10
; GCN-O0-NEXT: v_mov_b32_e32 v1, v9
; GCN-O0-NEXT: v_mov_b32_e32 v6, v8
; GCN-O0-NEXT: v_mov_b32_e32 v2, v7
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel
store <16 x double> %v, ptr addrspace(1) %out
ret void
}
; double15_inselt: insert the constant double 1.0 into a <15 x double> kernel
; argument at the run-time index %sel and store the resulting vector to %out.
; The assertions inside the function are autogenerated (see the note at the top
; of this file); the GCN prefix belongs to the default llc invocation and the
; GCN-O0 prefix to the -O0 invocation.
; What the expected output shows for both invocations:
; - the dword loaded from kernarg offset 0x124 (presumably %sel) is shifted left
;   by 1 and placed in m0, then two v_movreld_b32 writes are emitted, one per
;   32-bit half of the double 1.0 (0 and 0x3ff00000 in the default output);
; - the 120-byte result is written with seven flat_store_dwordx4 and a single
;   flat_store_dwordx2 at offset 0x70 for the last element.
define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> %vec, i32 %sel) {
; GCN-LABEL: double15_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x114
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x104
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0xe4
; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v24, s0
; GCN-NEXT: s_load_dword s0, s[4:5], 0x124
; GCN-NEXT: v_mov_b32_e32 v25, s1
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 m0, s0, 1
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v5, s13
; GCN-NEXT: v_mov_b32_e32 v6, s14
; GCN-NEXT: v_mov_b32_e32 v7, s15
; GCN-NEXT: v_mov_b32_e32 v8, s16
; GCN-NEXT: v_mov_b32_e32 v9, s17
; GCN-NEXT: v_mov_b32_e32 v10, s18
; GCN-NEXT: v_mov_b32_e32 v11, s19
; GCN-NEXT: v_mov_b32_e32 v12, s20
; GCN-NEXT: v_mov_b32_e32 v13, s21
; GCN-NEXT: v_mov_b32_e32 v14, s22
; GCN-NEXT: v_mov_b32_e32 v15, s23
; GCN-NEXT: v_mov_b32_e32 v16, s24
; GCN-NEXT: v_mov_b32_e32 v17, s25
; GCN-NEXT: v_mov_b32_e32 v18, s26
; GCN-NEXT: v_mov_b32_e32 v19, s27
; GCN-NEXT: v_mov_b32_e32 v20, s28
; GCN-NEXT: v_mov_b32_e32 v21, s29
; GCN-NEXT: v_mov_b32_e32 v22, s30
; GCN-NEXT: v_mov_b32_e32 v23, s31
; GCN-NEXT: v_mov_b32_e32 v26, s2
; GCN-NEXT: v_mov_b32_e32 v27, s3
; GCN-NEXT: v_mov_b32_e32 v28, s6
; GCN-NEXT: v_mov_b32_e32 v29, s7
; GCN-NEXT: v_movreld_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s2, s0, 0x50
; GCN-NEXT: v_movreld_b32_e32 v1, v32
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_mov_b32_e32 v31, s3
; GCN-NEXT: v_mov_b32_e32 v30, s2
; GCN-NEXT: s_add_u32 s2, s0, 64
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[30:31], v[20:23]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v21, s3
; GCN-NEXT: v_mov_b32_e32 v20, s2
; GCN-NEXT: s_add_u32 s2, s0, 48
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v17, s3
; GCN-NEXT: v_mov_b32_e32 v16, s2
; GCN-NEXT: s_add_u32 s2, s0, 32
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v13, s3
; GCN-NEXT: v_mov_b32_e32 v12, s2
; GCN-NEXT: s_add_u32 s2, s0, 16
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v9, s3
; GCN-NEXT: v_mov_b32_e32 v8, s2
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_add_u32 s2, s0, 0x70
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_add_u32 s0, s0, 0x60
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[28:29]
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: double15_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x114
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s6, s1
; GCN-O0-NEXT: s_mov_b32 s7, s0
; GCN-O0-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x104
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s8, s15
; GCN-O0-NEXT: s_mov_b32 s9, s14
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0xe4
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s12, s27
; GCN-O0-NEXT: s_mov_b32 s13, s26
; GCN-O0-NEXT: s_mov_b32 s14, s25
; GCN-O0-NEXT: s_mov_b32 s15, s24
; GCN-O0-NEXT: s_mov_b32 s16, s23
; GCN-O0-NEXT: s_mov_b32 s17, s22
; GCN-O0-NEXT: s_mov_b32 s18, s21
; GCN-O0-NEXT: s_mov_b32 s19, s20
; GCN-O0-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s20, s51
; GCN-O0-NEXT: s_mov_b32 s21, s50
; GCN-O0-NEXT: s_mov_b32 s22, s49
; GCN-O0-NEXT: s_mov_b32 s23, s48
; GCN-O0-NEXT: s_mov_b32 s24, s47
; GCN-O0-NEXT: s_mov_b32 s25, s46
; GCN-O0-NEXT: s_mov_b32 s26, s45
; GCN-O0-NEXT: s_mov_b32 s27, s44
; GCN-O0-NEXT: s_mov_b32 s28, s43
; GCN-O0-NEXT: s_mov_b32 s29, s42
; GCN-O0-NEXT: s_mov_b32 s30, s41
; GCN-O0-NEXT: s_mov_b32 s31, s40
; GCN-O0-NEXT: s_mov_b32 s33, s39
; GCN-O0-NEXT: s_mov_b32 s34, s38
; GCN-O0-NEXT: s_mov_b32 s35, s37
; GCN-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
; GCN-O0-NEXT: ; implicit-def: $sgpr1
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: ; implicit-def: $sgpr0
; GCN-O0-NEXT: ; implicit-def: $sgpr37
; GCN-O0-NEXT: v_mov_b32_e32 v7, s36
; GCN-O0-NEXT: v_mov_b32_e32 v62, s35
; GCN-O0-NEXT: v_mov_b32_e32 v61, s34
; GCN-O0-NEXT: v_mov_b32_e32 v60, s33
; GCN-O0-NEXT: v_mov_b32_e32 v59, s31
; GCN-O0-NEXT: v_mov_b32_e32 v58, s30
; GCN-O0-NEXT: v_mov_b32_e32 v57, s29
; GCN-O0-NEXT: v_mov_b32_e32 v56, s28
; GCN-O0-NEXT: v_mov_b32_e32 v55, s27
; GCN-O0-NEXT: v_mov_b32_e32 v54, s26
; GCN-O0-NEXT: v_mov_b32_e32 v53, s25
; GCN-O0-NEXT: v_mov_b32_e32 v52, s24
; GCN-O0-NEXT: v_mov_b32_e32 v51, s23
; GCN-O0-NEXT: v_mov_b32_e32 v50, s22
; GCN-O0-NEXT: v_mov_b32_e32 v49, s21
; GCN-O0-NEXT: v_mov_b32_e32 v48, s20
; GCN-O0-NEXT: v_mov_b32_e32 v47, s19
; GCN-O0-NEXT: v_mov_b32_e32 v46, s18
; GCN-O0-NEXT: v_mov_b32_e32 v45, s17
; GCN-O0-NEXT: v_mov_b32_e32 v44, s16
; GCN-O0-NEXT: v_mov_b32_e32 v43, s15
; GCN-O0-NEXT: v_mov_b32_e32 v42, s14
; GCN-O0-NEXT: v_mov_b32_e32 v41, s13
; GCN-O0-NEXT: v_mov_b32_e32 v40, s12
; GCN-O0-NEXT: v_mov_b32_e32 v39, s11
; GCN-O0-NEXT: v_mov_b32_e32 v6, s10
; GCN-O0-NEXT: v_mov_b32_e32 v5, s9
; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v8, v62
; GCN-O0-NEXT: v_mov_b32_e32 v9, v61
; GCN-O0-NEXT: v_mov_b32_e32 v10, v60
; GCN-O0-NEXT: v_mov_b32_e32 v11, v59
; GCN-O0-NEXT: v_mov_b32_e32 v12, v58
; GCN-O0-NEXT: v_mov_b32_e32 v13, v57
; GCN-O0-NEXT: v_mov_b32_e32 v14, v56
; GCN-O0-NEXT: v_mov_b32_e32 v15, v55
; GCN-O0-NEXT: v_mov_b32_e32 v16, v54
; GCN-O0-NEXT: v_mov_b32_e32 v17, v53
; GCN-O0-NEXT: v_mov_b32_e32 v18, v52
; GCN-O0-NEXT: v_mov_b32_e32 v19, v51
; GCN-O0-NEXT: v_mov_b32_e32 v20, v50
; GCN-O0-NEXT: v_mov_b32_e32 v21, v49
; GCN-O0-NEXT: v_mov_b32_e32 v22, v48
; GCN-O0-NEXT: v_mov_b32_e32 v23, v47
; GCN-O0-NEXT: v_mov_b32_e32 v24, v46
; GCN-O0-NEXT: v_mov_b32_e32 v25, v45
; GCN-O0-NEXT: v_mov_b32_e32 v26, v44
; GCN-O0-NEXT: v_mov_b32_e32 v27, v43
; GCN-O0-NEXT: v_mov_b32_e32 v28, v42
; GCN-O0-NEXT: v_mov_b32_e32 v29, v41
; GCN-O0-NEXT: v_mov_b32_e32 v30, v40
; GCN-O0-NEXT: v_mov_b32_e32 v31, v39
; GCN-O0-NEXT: v_mov_b32_e32 v32, v6
; GCN-O0-NEXT: v_mov_b32_e32 v33, v5
; GCN-O0-NEXT: v_mov_b32_e32 v34, v4
; GCN-O0-NEXT: v_mov_b32_e32 v35, v3
; GCN-O0-NEXT: v_mov_b32_e32 v36, v2
; GCN-O0-NEXT: v_mov_b32_e32 v37, v1
; GCN-O0-NEXT: v_mov_b32_e32 v38, v0
; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x124
; GCN-O0-NEXT: s_mov_b32 s1, 1
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_lshl_b32 s0, s0, s1
; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v0, s1
; GCN-O0-NEXT: s_mov_b32 m0, s0
; GCN-O0-NEXT: v_movreld_b32_e32 v7, v0
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v0, s1
; GCN-O0-NEXT: s_mov_b32 m0, s0
; GCN-O0-NEXT: v_movreld_b32_e32 v8, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v34
; GCN-O0-NEXT: v_mov_b32_e32 v1, v33
; GCN-O0-NEXT: v_mov_b32_e32 v6, v32
; GCN-O0-NEXT: v_mov_b32_e32 v2, v31
; GCN-O0-NEXT: v_mov_b32_e32 v3, v26
; GCN-O0-NEXT: v_mov_b32_e32 v4, v25
; GCN-O0-NEXT: v_mov_b32_e32 v5, v24
; GCN-O0-NEXT: v_mov_b32_e32 v39, v23
; GCN-O0-NEXT: v_mov_b32_e32 v40, v30
; GCN-O0-NEXT: v_mov_b32_e32 v41, v29
; GCN-O0-NEXT: v_mov_b32_e32 v46, v28
; GCN-O0-NEXT: v_mov_b32_e32 v42, v27
; GCN-O0-NEXT: v_mov_b32_e32 v43, v10
; GCN-O0-NEXT: v_mov_b32_e32 v44, v9
; GCN-O0-NEXT: v_mov_b32_e32 v45, v8
; GCN-O0-NEXT: v_mov_b32_e32 v47, v7
; GCN-O0-NEXT: v_mov_b32_e32 v48, v14
; GCN-O0-NEXT: v_mov_b32_e32 v49, v13
; GCN-O0-NEXT: v_mov_b32_e32 v54, v12
; GCN-O0-NEXT: v_mov_b32_e32 v50, v11
; GCN-O0-NEXT: v_mov_b32_e32 v51, v18
; GCN-O0-NEXT: v_mov_b32_e32 v52, v17
; GCN-O0-NEXT: v_mov_b32_e32 v53, v16
; GCN-O0-NEXT: v_mov_b32_e32 v55, v15
; GCN-O0-NEXT: v_mov_b32_e32 v56, v22
; GCN-O0-NEXT: v_mov_b32_e32 v57, v21
; GCN-O0-NEXT: v_mov_b32_e32 v62, v20
; GCN-O0-NEXT: v_mov_b32_e32 v58, v19
; GCN-O0-NEXT: ; kill: def $vgpr58 killed $vgpr58 def $vgpr58_vgpr59_vgpr60_vgpr61 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v59, v62
; GCN-O0-NEXT: v_mov_b32_e32 v60, v57
; GCN-O0-NEXT: v_mov_b32_e32 v61, v56
; GCN-O0-NEXT: s_mov_b64 s[6:7], 48
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s5
; GCN-O0-NEXT: s_addc_u32 s4, s1, s4
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v57, s1
; GCN-O0-NEXT: v_mov_b32_e32 v56, s0
; GCN-O0-NEXT: flat_store_dwordx4 v[56:57], v[58:61]
; GCN-O0-NEXT: ; kill: def $vgpr55 killed $vgpr55 def $vgpr55_vgpr56_vgpr57_vgpr58 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v56, v53
; GCN-O0-NEXT: v_mov_b32_e32 v57, v52
; GCN-O0-NEXT: v_mov_b32_e32 v58, v51
; GCN-O0-NEXT: s_mov_b64 s[6:7], 32
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s5
; GCN-O0-NEXT: s_addc_u32 s4, s1, s4
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v52, s1
; GCN-O0-NEXT: v_mov_b32_e32 v51, s0
; GCN-O0-NEXT: flat_store_dwordx4 v[51:52], v[55:58]
; GCN-O0-NEXT: ; kill: def $vgpr50 killed $vgpr50 def $vgpr50_vgpr51_vgpr52_vgpr53 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v51, v54
; GCN-O0-NEXT: v_mov_b32_e32 v52, v49
; GCN-O0-NEXT: v_mov_b32_e32 v53, v48
; GCN-O0-NEXT: s_mov_b64 s[6:7], 16
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s5
; GCN-O0-NEXT: s_addc_u32 s4, s1, s4
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v49, s1
; GCN-O0-NEXT: v_mov_b32_e32 v48, s0
; GCN-O0-NEXT: flat_store_dwordx4 v[48:49], v[50:53]
; GCN-O0-NEXT: ; kill: def $vgpr47 killed $vgpr47 def $vgpr47_vgpr48_vgpr49_vgpr50 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v48, v45
; GCN-O0-NEXT: v_mov_b32_e32 v49, v44
; GCN-O0-NEXT: v_mov_b32_e32 v50, v43
; GCN-O0-NEXT: v_mov_b32_e32 v44, s3
; GCN-O0-NEXT: v_mov_b32_e32 v43, s2
; GCN-O0-NEXT: flat_store_dwordx4 v[43:44], v[47:50]
; GCN-O0-NEXT: ; kill: def $vgpr42 killed $vgpr42 def $vgpr42_vgpr43_vgpr44_vgpr45 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v43, v46
; GCN-O0-NEXT: v_mov_b32_e32 v44, v41
; GCN-O0-NEXT: v_mov_b32_e32 v45, v40
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x50
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s5
; GCN-O0-NEXT: s_addc_u32 s4, s1, s4
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v41, s1
; GCN-O0-NEXT: v_mov_b32_e32 v40, s0
; GCN-O0-NEXT: flat_store_dwordx4 v[40:41], v[42:45]
; GCN-O0-NEXT: ; kill: def $vgpr39 killed $vgpr39 def $vgpr39_vgpr40_vgpr41_vgpr42 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v40, v5
; GCN-O0-NEXT: v_mov_b32_e32 v41, v4
; GCN-O0-NEXT: v_mov_b32_e32 v42, v3
; GCN-O0-NEXT: s_mov_b64 s[6:7], 64
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s5
; GCN-O0-NEXT: s_addc_u32 s4, s1, s4
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v4, s1
; GCN-O0-NEXT: v_mov_b32_e32 v3, s0
; GCN-O0-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
; GCN-O0-NEXT: v_mov_b32_e32 v3, v36
; GCN-O0-NEXT: v_mov_b32_e32 v7, v35
; GCN-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v8, v3
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0x70
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s5, s6
; GCN-O0-NEXT: s_mov_b32 s4, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s5
; GCN-O0-NEXT: s_addc_u32 s4, s1, s4
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s4
; GCN-O0-NEXT: v_mov_b32_e32 v4, s1
; GCN-O0-NEXT: v_mov_b32_e32 v3, s0
; GCN-O0-NEXT: flat_store_dwordx2 v[3:4], v[7:8]
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v3, v6
; GCN-O0-NEXT: v_mov_b32_e32 v4, v1
; GCN-O0-NEXT: v_mov_b32_e32 v5, v0
; GCN-O0-NEXT: s_mov_b64 s[4:5], 0x60
; GCN-O0-NEXT: s_mov_b32 s0, s2
; GCN-O0-NEXT: s_mov_b32 s1, s3
; GCN-O0-NEXT: s_mov_b32 s3, s4
; GCN-O0-NEXT: s_mov_b32 s2, s5
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GCN-O0-NEXT: s_endpgm
entry:
; %sel is a kernel argument, so the insert position is only known at run time.
%v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel
; Write back the whole 15-element vector, including the updated element.
store <15 x double> %v, ptr addrspace(1) %out
ret void
}
; FIXME: Fold out s_or_b32 s2, 0, s3
; bit4_inselt: insert the i1 value 1 into a <4 x i1> kernel argument at the
; run-time index %sel and store the resulting vector to %out.
; The assertions inside the function are autogenerated (see the note at the top
; of this file); the GCN prefix belongs to the default llc invocation and the
; GCN-O0 prefix to the -O0 invocation.
; What the expected output shows for both invocations:
; - a scratch resource descriptor is built in s[12:15] and each of the four
;   input bits is stored as its own byte at scratch offsets 0..3;
; - the index is masked with 3, or-ed with 0 (the redundant or mentioned in the
;   FIXME above) and used as the offen address of a byte store of the value 1;
; - the four bytes are loaded back and recombined with and/shift/or, masked
;   with 15, and written to %out with a single flat_store_byte.
define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) {
; GCN-LABEL: bit4_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: s_mov_b32 s15, 0xe80000
; GCN-NEXT: s_add_u32 s12, s12, s11
; GCN-NEXT: s_addc_u32 s13, s13, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bfe_u32 s6, s2, 0x10003
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_bfe_u32 s5, s2, 0x20002
; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: s_bfe_u32 s4, s2, 0x10001
; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: s_and_b32 s3, s3, 3
; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_or_b32_e64 v1, s3, 0
; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen
; GCN-NEXT: buffer_load_ubyte v0, off, s[12:15], 0
; GCN-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:1
; GCN-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2
; GCN-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:3
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_lshlrev_b16_e32 v1, 1, v1
; GCN-NEXT: v_lshlrev_b16_e32 v2, 2, v2
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
; GCN-NEXT: v_or_b32_e32 v0, v0, v3
; GCN-NEXT: v_and_b32_e32 v2, 15, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_byte v[0:1], v2
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: bit4_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe80000
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-O0-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-O0-NEXT: s_load_dword s2, s[2:3], 0x30
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_bfe_u32 s3, s4, 0x10001
; GCN-O0-NEXT: s_bfe_u32 s5, s4, 0x20002
; GCN-O0-NEXT: s_bfe_u32 s6, s4, 0x10003
; GCN-O0-NEXT: s_mov_b32 s7, 3
; GCN-O0-NEXT: s_and_b32 s7, s2, s7
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_or_b32 s2, s2, s7
; GCN-O0-NEXT: v_mov_b32_e32 v0, s6
; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3
; GCN-O0-NEXT: v_mov_b32_e32 v0, s5
; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0
; GCN-O0-NEXT: v_mov_b32_e32 v0, s3
; GCN-O0-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1
; GCN-O0-NEXT: v_mov_b32_e32 v3, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: buffer_store_byte v3, v0, s[12:15], 0 offen
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[12:15], 0
; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[12:15], 0 offset:1
; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2
; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:3
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_and_b32_e64 v4, v4, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v4, v3, v4
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v4
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3
; GCN-O0-NEXT: s_mov_b32 s2, 2
; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s2, v2
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GCN-O0-NEXT: s_mov_b32 s2, 3
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: s_mov_b32 s2, 15
; GCN-O0-NEXT: v_and_b32_e64 v2, v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: s_endpgm
entry:
; %sel is a kernel argument, so the insert position is only known at run time.
%v = insertelement <4 x i1> %vec, i1 1, i32 %sel
; Write back the whole 4-bit vector, including the updated bit.
store <4 x i1> %v, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, i32 %sel) {
; GCN-LABEL: bit128_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dword s6, s[4:5], 0x44
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GCN-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bfe_u32 s9, s0, 0xf0001
; GCN-NEXT: s_lshr_b32 s42, s1, 16
; GCN-NEXT: v_writelane_b32 v6, s4, 0
; GCN-NEXT: v_writelane_b32 v6, s5, 1
; GCN-NEXT: s_lshr_b32 s4, s0, 16
; GCN-NEXT: v_writelane_b32 v6, s4, 2
; GCN-NEXT: s_lshr_b32 s4, s0, 17
; GCN-NEXT: v_writelane_b32 v6, s4, 3
; GCN-NEXT: s_lshr_b32 s4, s0, 18
; GCN-NEXT: v_writelane_b32 v6, s4, 4
; GCN-NEXT: s_lshr_b32 s4, s0, 19
; GCN-NEXT: v_writelane_b32 v6, s4, 5
; GCN-NEXT: s_lshr_b32 s4, s0, 20
; GCN-NEXT: v_writelane_b32 v6, s4, 6
; GCN-NEXT: s_lshr_b32 s4, s0, 21
; GCN-NEXT: v_writelane_b32 v6, s4, 7
; GCN-NEXT: s_lshr_b32 s4, s0, 22
; GCN-NEXT: v_writelane_b32 v6, s4, 8
; GCN-NEXT: s_lshr_b32 s4, s0, 23
; GCN-NEXT: v_writelane_b32 v6, s4, 9
; GCN-NEXT: s_lshr_b32 s4, s0, 24
; GCN-NEXT: v_writelane_b32 v6, s4, 10
; GCN-NEXT: s_lshr_b32 s4, s0, 25
; GCN-NEXT: v_writelane_b32 v6, s4, 11
; GCN-NEXT: s_lshr_b32 s4, s0, 26
; GCN-NEXT: v_writelane_b32 v6, s4, 12
; GCN-NEXT: s_lshr_b32 s4, s0, 27
; GCN-NEXT: v_writelane_b32 v6, s4, 13
; GCN-NEXT: s_lshr_b32 s4, s0, 28
; GCN-NEXT: v_writelane_b32 v6, s4, 14
; GCN-NEXT: s_lshr_b32 s4, s0, 29
; GCN-NEXT: v_writelane_b32 v6, s4, 15
; GCN-NEXT: s_lshr_b32 s4, s0, 30
; GCN-NEXT: v_writelane_b32 v6, s4, 16
; GCN-NEXT: s_lshr_b32 s4, s0, 31
; GCN-NEXT: v_writelane_b32 v6, s4, 17
; GCN-NEXT: v_writelane_b32 v6, s9, 18
; GCN-NEXT: s_bfe_u32 s9, s0, 0xe0002
; GCN-NEXT: v_writelane_b32 v6, s9, 19
; GCN-NEXT: s_bfe_u32 s9, s0, 0xd0003
; GCN-NEXT: v_writelane_b32 v6, s9, 20
; GCN-NEXT: s_bfe_u32 s9, s0, 0xc0004
; GCN-NEXT: v_writelane_b32 v6, s9, 21
; GCN-NEXT: s_bfe_u32 s9, s0, 0xb0005
; GCN-NEXT: v_writelane_b32 v6, s9, 22
; GCN-NEXT: s_bfe_u32 s9, s0, 0xa0006
; GCN-NEXT: v_writelane_b32 v6, s9, 23
; GCN-NEXT: s_bfe_u32 s9, s0, 0x90007
; GCN-NEXT: v_writelane_b32 v6, s9, 24
; GCN-NEXT: s_bfe_u32 s9, s0, 0x80008
; GCN-NEXT: v_writelane_b32 v6, s9, 25
; GCN-NEXT: s_bfe_u32 s9, s0, 0x70009
; GCN-NEXT: v_writelane_b32 v6, s9, 26
; GCN-NEXT: s_bfe_u32 s9, s0, 0x6000a
; GCN-NEXT: v_writelane_b32 v6, s9, 27
; GCN-NEXT: s_bfe_u32 s9, s0, 0x5000b
; GCN-NEXT: v_writelane_b32 v6, s9, 28
; GCN-NEXT: s_bfe_u32 s9, s0, 0x4000c
; GCN-NEXT: v_writelane_b32 v6, s9, 29
; GCN-NEXT: s_bfe_u32 s9, s0, 0x3000d
; GCN-NEXT: v_writelane_b32 v6, s9, 30
; GCN-NEXT: s_bfe_u32 s9, s0, 0x2000e
; GCN-NEXT: v_writelane_b32 v6, s9, 31
; GCN-NEXT: s_bfe_u32 s9, s0, 0x1000f
; GCN-NEXT: v_writelane_b32 v6, s9, 32
; GCN-NEXT: s_bfe_u32 s9, s1, 0xf0001
; GCN-NEXT: s_lshr_b32 s43, s1, 17
; GCN-NEXT: s_lshr_b32 s45, s1, 18
; GCN-NEXT: s_lshr_b32 s47, s1, 19
; GCN-NEXT: s_lshr_b32 s50, s1, 20
; GCN-NEXT: s_lshr_b32 s51, s1, 21
; GCN-NEXT: s_lshr_b32 s53, s1, 22
; GCN-NEXT: s_lshr_b32 s55, s1, 23
; GCN-NEXT: s_lshr_b32 s58, s1, 24
; GCN-NEXT: s_lshr_b32 s59, s1, 25
; GCN-NEXT: s_lshr_b32 s61, s1, 26
; GCN-NEXT: s_lshr_b32 s63, s1, 27
; GCN-NEXT: s_lshr_b32 s66, s1, 28
; GCN-NEXT: s_lshr_b32 s67, s1, 29
; GCN-NEXT: s_lshr_b32 s68, s1, 30
; GCN-NEXT: s_lshr_b32 s69, s1, 31
; GCN-NEXT: s_lshr_b32 s73, s2, 16
; GCN-NEXT: s_lshr_b32 s74, s2, 17
; GCN-NEXT: s_lshr_b32 s77, s2, 18
; GCN-NEXT: s_lshr_b32 s78, s2, 19
; GCN-NEXT: s_lshr_b32 s81, s2, 20
; GCN-NEXT: s_lshr_b32 s82, s2, 21
; GCN-NEXT: s_lshr_b32 s84, s2, 22
; GCN-NEXT: s_lshr_b32 s86, s2, 23
; GCN-NEXT: s_lshr_b32 s89, s2, 24
; GCN-NEXT: s_lshr_b32 s90, s2, 25
; GCN-NEXT: s_lshr_b32 s93, s2, 26
; GCN-NEXT: s_lshr_b32 s94, s2, 27
; GCN-NEXT: s_lshr_b32 vcc_hi, s2, 28
; GCN-NEXT: s_lshr_b32 s39, s2, 29
; GCN-NEXT: s_lshr_b32 s38, s2, 30
; GCN-NEXT: s_lshr_b32 s37, s2, 31
; GCN-NEXT: s_lshr_b32 s33, s3, 16
; GCN-NEXT: s_lshr_b32 s31, s3, 17
; GCN-NEXT: s_lshr_b32 s28, s3, 18
; GCN-NEXT: s_lshr_b32 s27, s3, 19
; GCN-NEXT: s_lshr_b32 s24, s3, 20
; GCN-NEXT: s_lshr_b32 s23, s3, 21
; GCN-NEXT: s_lshr_b32 s20, s3, 22
; GCN-NEXT: s_lshr_b32 s19, s3, 23
; GCN-NEXT: s_lshr_b32 s16, s3, 24
; GCN-NEXT: s_lshr_b32 s15, s3, 25
; GCN-NEXT: s_lshr_b32 s12, s3, 26
; GCN-NEXT: s_lshr_b32 s11, s3, 27
; GCN-NEXT: s_lshr_b32 s8, s3, 28
; GCN-NEXT: s_lshr_b32 s7, s3, 29
; GCN-NEXT: s_lshr_b32 s5, s3, 30
; GCN-NEXT: s_lshr_b32 s4, s3, 31
; GCN-NEXT: v_writelane_b32 v6, s9, 33
; GCN-NEXT: s_bfe_u32 s40, s1, 0xe0002
; GCN-NEXT: s_bfe_u32 s41, s1, 0xd0003
; GCN-NEXT: s_bfe_u32 s44, s1, 0xc0004
; GCN-NEXT: s_bfe_u32 s46, s1, 0xb0005
; GCN-NEXT: s_bfe_u32 s48, s1, 0xa0006
; GCN-NEXT: s_bfe_u32 s49, s1, 0x90007
; GCN-NEXT: s_bfe_u32 s52, s1, 0x80008
; GCN-NEXT: s_bfe_u32 s54, s1, 0x70009
; GCN-NEXT: s_bfe_u32 s56, s1, 0x6000a
; GCN-NEXT: s_bfe_u32 s57, s1, 0x5000b
; GCN-NEXT: s_bfe_u32 s60, s1, 0x4000c
; GCN-NEXT: s_bfe_u32 s62, s1, 0x3000d
; GCN-NEXT: s_bfe_u32 s64, s1, 0x2000e
; GCN-NEXT: s_bfe_u32 s65, s1, 0x1000f
; GCN-NEXT: s_bfe_u32 s70, s2, 0xf0001
; GCN-NEXT: s_bfe_u32 s71, s2, 0xe0002
; GCN-NEXT: s_bfe_u32 s72, s2, 0xd0003
; GCN-NEXT: s_bfe_u32 s75, s2, 0xc0004
; GCN-NEXT: s_bfe_u32 s76, s2, 0xb0005
; GCN-NEXT: s_bfe_u32 s79, s2, 0xa0006
; GCN-NEXT: s_bfe_u32 s80, s2, 0x90007
; GCN-NEXT: s_bfe_u32 s83, s2, 0x80008
; GCN-NEXT: s_bfe_u32 s85, s2, 0x70009
; GCN-NEXT: s_bfe_u32 s87, s2, 0x6000a
; GCN-NEXT: s_bfe_u32 s88, s2, 0x5000b
; GCN-NEXT: s_bfe_u32 s91, s2, 0x4000c
; GCN-NEXT: s_bfe_u32 s92, s2, 0x3000d
; GCN-NEXT: s_bfe_u32 s95, s2, 0x2000e
; GCN-NEXT: s_bfe_u32 vcc_lo, s2, 0x1000f
; GCN-NEXT: s_bfe_u32 s36, s3, 0xf0001
; GCN-NEXT: s_bfe_u32 s35, s3, 0xe0002
; GCN-NEXT: s_bfe_u32 s34, s3, 0xd0003
; GCN-NEXT: s_bfe_u32 s30, s3, 0xc0004
; GCN-NEXT: s_bfe_u32 s29, s3, 0xb0005
; GCN-NEXT: s_bfe_u32 s26, s3, 0xa0006
; GCN-NEXT: s_bfe_u32 s25, s3, 0x90007
; GCN-NEXT: s_bfe_u32 s22, s3, 0x80008
; GCN-NEXT: s_bfe_u32 s21, s3, 0x70009
; GCN-NEXT: s_bfe_u32 s18, s3, 0x6000a
; GCN-NEXT: s_bfe_u32 s17, s3, 0x5000b
; GCN-NEXT: s_bfe_u32 s14, s3, 0x4000c
; GCN-NEXT: s_bfe_u32 s13, s3, 0x3000d
; GCN-NEXT: s_bfe_u32 s10, s3, 0x2000e
; GCN-NEXT: s_bfe_u32 s9, s3, 0x1000f
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7f
; GCN-NEXT: s_cselect_b32 s4, s4, 1
; GCN-NEXT: s_lshl_b32 s4, s4, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7e
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 2
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7d
; GCN-NEXT: s_cselect_b32 s5, s7, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7c
; GCN-NEXT: s_cselect_b32 s7, s8, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 3
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_lshl_b32 s4, s4, 12
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7b
; GCN-NEXT: s_cselect_b32 s5, s11, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7a
; GCN-NEXT: s_cselect_b32 s7, s12, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x79
; GCN-NEXT: s_cselect_b32 s7, s15, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x78
; GCN-NEXT: s_cselect_b32 s8, s16, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 15
; GCN-NEXT: s_lshl_b32 s5, s5, 8
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x77
; GCN-NEXT: s_cselect_b32 s5, s19, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x76
; GCN-NEXT: s_cselect_b32 s7, s20, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x75
; GCN-NEXT: s_cselect_b32 s7, s23, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x74
; GCN-NEXT: s_cselect_b32 s8, s24, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_lshl_b32 s5, s5, 4
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x73
; GCN-NEXT: s_cselect_b32 s7, s27, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x72
; GCN-NEXT: s_cselect_b32 s8, s28, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x71
; GCN-NEXT: s_cselect_b32 s8, s31, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x70
; GCN-NEXT: s_cselect_b32 s11, s33, 1
; GCN-NEXT: s_and_b32 s11, s11, 1
; GCN-NEXT: s_or_b32 s8, s11, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 15
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 0xff
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_lshl_b32 s4, s4, 16
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6f
; GCN-NEXT: s_cselect_b32 s5, s9, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6e
; GCN-NEXT: s_cselect_b32 s7, s10, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6d
; GCN-NEXT: s_cselect_b32 s7, s13, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6c
; GCN-NEXT: s_cselect_b32 s8, s14, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_lshl_b32 s5, s5, 12
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6b
; GCN-NEXT: s_cselect_b32 s7, s17, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6a
; GCN-NEXT: s_cselect_b32 s8, s18, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x69
; GCN-NEXT: s_cselect_b32 s8, s21, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x68
; GCN-NEXT: s_cselect_b32 s9, s22, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 15
; GCN-NEXT: s_lshl_b32 s7, s7, 8
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x67
; GCN-NEXT: s_cselect_b32 s7, s25, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x66
; GCN-NEXT: s_cselect_b32 s8, s26, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x65
; GCN-NEXT: s_cselect_b32 s8, s29, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x64
; GCN-NEXT: s_cselect_b32 s9, s30, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_lshl_b32 s7, s7, 4
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x63
; GCN-NEXT: s_cselect_b32 s8, s34, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x62
; GCN-NEXT: s_cselect_b32 s9, s35, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_lshl_b32 s9, s9, 2
; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x60
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_and_b32 s3, s3, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x61
; GCN-NEXT: s_cselect_b32 s9, s36, 1
; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s3, s3, s9
; GCN-NEXT: s_and_b32 s3, s3, 3
; GCN-NEXT: s_or_b32 s3, s3, s8
; GCN-NEXT: s_and_b32 s3, s3, 15
; GCN-NEXT: s_or_b32 s3, s3, s7
; GCN-NEXT: s_and_b32 s3, s3, 0xff
; GCN-NEXT: s_or_b32 s3, s3, s5
; GCN-NEXT: s_and_b32 s3, s3, 0xffff
; GCN-NEXT: s_or_b32 s3, s3, s4
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5f
; GCN-NEXT: s_cselect_b32 s4, s37, 1
; GCN-NEXT: s_lshl_b32 s4, s4, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5e
; GCN-NEXT: s_cselect_b32 s5, s38, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 2
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5d
; GCN-NEXT: s_cselect_b32 s5, s39, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5c
; GCN-NEXT: s_cselect_b32 s7, vcc_hi, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 3
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_lshl_b32 s4, s4, 12
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5b
; GCN-NEXT: s_cselect_b32 s5, s94, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5a
; GCN-NEXT: s_cselect_b32 s7, s93, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x59
; GCN-NEXT: s_cselect_b32 s7, s90, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x58
; GCN-NEXT: s_cselect_b32 s8, s89, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 15
; GCN-NEXT: s_lshl_b32 s5, s5, 8
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x57
; GCN-NEXT: s_cselect_b32 s5, s86, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x56
; GCN-NEXT: s_cselect_b32 s7, s84, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x55
; GCN-NEXT: s_cselect_b32 s7, s82, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x54
; GCN-NEXT: s_cselect_b32 s8, s81, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_lshl_b32 s5, s5, 4
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x53
; GCN-NEXT: s_cselect_b32 s7, s78, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x52
; GCN-NEXT: s_cselect_b32 s8, s77, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x51
; GCN-NEXT: s_cselect_b32 s8, s74, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x50
; GCN-NEXT: s_cselect_b32 s9, s73, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 15
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 0xff
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_lshl_b32 s4, s4, 16
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4f
; GCN-NEXT: s_cselect_b32 s5, vcc_lo, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4e
; GCN-NEXT: s_cselect_b32 s7, s95, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4d
; GCN-NEXT: s_cselect_b32 s7, s92, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4c
; GCN-NEXT: s_cselect_b32 s8, s91, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_lshl_b32 s5, s5, 12
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4b
; GCN-NEXT: s_cselect_b32 s7, s88, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4a
; GCN-NEXT: s_cselect_b32 s8, s87, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x49
; GCN-NEXT: s_cselect_b32 s8, s85, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x48
; GCN-NEXT: s_cselect_b32 s9, s83, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 15
; GCN-NEXT: s_lshl_b32 s7, s7, 8
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x47
; GCN-NEXT: s_cselect_b32 s7, s80, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x46
; GCN-NEXT: s_cselect_b32 s8, s79, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x45
; GCN-NEXT: s_cselect_b32 s8, s76, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x44
; GCN-NEXT: s_cselect_b32 s9, s75, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_lshl_b32 s7, s7, 4
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x43
; GCN-NEXT: s_cselect_b32 s8, s72, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x42
; GCN-NEXT: s_cselect_b32 s9, s71, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_lshl_b32 s9, s9, 2
; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 64
; GCN-NEXT: s_cselect_b32 s2, s2, 1
; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmpk_lg_i32 s6, 0x41
; GCN-NEXT: s_cselect_b32 s9, s70, 1
; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s2, s2, s9
; GCN-NEXT: s_and_b32 s2, s2, 3
; GCN-NEXT: s_or_b32 s2, s2, s8
; GCN-NEXT: s_and_b32 s2, s2, 15
; GCN-NEXT: s_or_b32 s2, s2, s7
; GCN-NEXT: s_and_b32 s2, s2, 0xff
; GCN-NEXT: s_or_b32 s2, s2, s5
; GCN-NEXT: s_and_b32 s2, s2, 0xffff
; GCN-NEXT: s_or_b32 s2, s2, s4
; GCN-NEXT: s_cmp_lg_u32 s6, 63
; GCN-NEXT: s_cselect_b32 s4, s69, 1
; GCN-NEXT: s_lshl_b32 s4, s4, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 62
; GCN-NEXT: s_cselect_b32 s5, s68, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 2
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_cmp_lg_u32 s6, 61
; GCN-NEXT: s_cselect_b32 s5, s67, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 60
; GCN-NEXT: s_cselect_b32 s7, s66, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 3
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_lshl_b32 s4, s4, 12
; GCN-NEXT: s_cmp_lg_u32 s6, 59
; GCN-NEXT: s_cselect_b32 s5, s63, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 58
; GCN-NEXT: s_cselect_b32 s7, s61, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmp_lg_u32 s6, 57
; GCN-NEXT: s_cselect_b32 s7, s59, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 56
; GCN-NEXT: s_cselect_b32 s8, s58, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 15
; GCN-NEXT: s_lshl_b32 s5, s5, 8
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_cmp_lg_u32 s6, 55
; GCN-NEXT: s_cselect_b32 s5, s55, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 54
; GCN-NEXT: s_cselect_b32 s7, s53, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmp_lg_u32 s6, 53
; GCN-NEXT: s_cselect_b32 s7, s51, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 52
; GCN-NEXT: s_cselect_b32 s8, s50, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_lshl_b32 s5, s5, 4
; GCN-NEXT: s_cmp_lg_u32 s6, 51
; GCN-NEXT: s_cselect_b32 s7, s47, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 50
; GCN-NEXT: s_cselect_b32 s8, s45, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmp_lg_u32 s6, 49
; GCN-NEXT: s_cselect_b32 s8, s43, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 48
; GCN-NEXT: s_cselect_b32 s9, s42, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 15
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 0xff
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_lshl_b32 s4, s4, 16
; GCN-NEXT: s_cmp_lg_u32 s6, 47
; GCN-NEXT: s_cselect_b32 s5, s65, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 46
; GCN-NEXT: s_cselect_b32 s7, s64, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmp_lg_u32 s6, 45
; GCN-NEXT: s_cselect_b32 s7, s62, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 44
; GCN-NEXT: s_cselect_b32 s8, s60, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_lshl_b32 s5, s5, 12
; GCN-NEXT: s_cmp_lg_u32 s6, 43
; GCN-NEXT: s_cselect_b32 s7, s57, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 42
; GCN-NEXT: s_cselect_b32 s8, s56, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmp_lg_u32 s6, 41
; GCN-NEXT: s_cselect_b32 s8, s54, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 40
; GCN-NEXT: s_cselect_b32 s9, s52, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 15
; GCN-NEXT: s_lshl_b32 s7, s7, 8
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmp_lg_u32 s6, 39
; GCN-NEXT: s_cselect_b32 s7, s49, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 38
; GCN-NEXT: s_cselect_b32 s8, s48, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmp_lg_u32 s6, 37
; GCN-NEXT: s_cselect_b32 s8, s46, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 36
; GCN-NEXT: s_cselect_b32 s9, s44, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_lshl_b32 s7, s7, 4
; GCN-NEXT: s_cmp_lg_u32 s6, 35
; GCN-NEXT: s_cselect_b32 s8, s41, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 34
; GCN-NEXT: s_cselect_b32 s9, s40, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_lshl_b32 s9, s9, 2
; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 32
; GCN-NEXT: s_cselect_b32 s1, s1, 1
; GCN-NEXT: s_and_b32 s1, s1, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 33
; GCN-NEXT: v_readlane_b32 s9, v6, 33
; GCN-NEXT: s_cselect_b32 s9, s9, 1
; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s1, s1, s9
; GCN-NEXT: s_and_b32 s1, s1, 3
; GCN-NEXT: s_or_b32 s1, s1, s8
; GCN-NEXT: s_and_b32 s1, s1, 15
; GCN-NEXT: s_or_b32 s1, s1, s7
; GCN-NEXT: s_and_b32 s1, s1, 0xff
; GCN-NEXT: s_or_b32 s1, s1, s5
; GCN-NEXT: s_and_b32 s1, s1, 0xffff
; GCN-NEXT: s_or_b32 s1, s1, s4
; GCN-NEXT: s_cmp_lg_u32 s6, 31
; GCN-NEXT: v_readlane_b32 s4, v6, 17
; GCN-NEXT: s_cselect_b32 s4, s4, 1
; GCN-NEXT: s_lshl_b32 s4, s4, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 30
; GCN-NEXT: v_readlane_b32 s5, v6, 16
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 2
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_cmp_lg_u32 s6, 29
; GCN-NEXT: v_readlane_b32 s5, v6, 15
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 28
; GCN-NEXT: v_readlane_b32 s7, v6, 14
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 3
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_lshl_b32 s4, s4, 12
; GCN-NEXT: s_cmp_lg_u32 s6, 27
; GCN-NEXT: v_readlane_b32 s5, v6, 13
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 26
; GCN-NEXT: v_readlane_b32 s7, v6, 12
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmp_lg_u32 s6, 25
; GCN-NEXT: v_readlane_b32 s7, v6, 11
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 24
; GCN-NEXT: v_readlane_b32 s8, v6, 10
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 15
; GCN-NEXT: s_lshl_b32 s5, s5, 8
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_cmp_lg_u32 s6, 23
; GCN-NEXT: v_readlane_b32 s5, v6, 9
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 22
; GCN-NEXT: v_readlane_b32 s7, v6, 8
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmp_lg_u32 s6, 21
; GCN-NEXT: v_readlane_b32 s7, v6, 7
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 20
; GCN-NEXT: v_readlane_b32 s8, v6, 6
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_lshl_b32 s5, s5, 4
; GCN-NEXT: s_cmp_lg_u32 s6, 19
; GCN-NEXT: v_readlane_b32 s7, v6, 5
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 18
; GCN-NEXT: v_readlane_b32 s8, v6, 4
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmp_lg_u32 s6, 17
; GCN-NEXT: v_readlane_b32 s8, v6, 3
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 16
; GCN-NEXT: v_readlane_b32 s9, v6, 2
; GCN-NEXT: s_cselect_b32 s9, s9, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 15
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_and_b32 s5, s5, 0xff
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_lshl_b32 s4, s4, 16
; GCN-NEXT: s_cmp_lg_u32 s6, 15
; GCN-NEXT: v_readlane_b32 s5, v6, 32
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 14
; GCN-NEXT: v_readlane_b32 s7, v6, 31
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_and_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmp_lg_u32 s6, 13
; GCN-NEXT: v_readlane_b32 s7, v6, 30
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 12
; GCN-NEXT: v_readlane_b32 s8, v6, 29
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 3
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_lshl_b32 s5, s5, 12
; GCN-NEXT: s_cmp_lg_u32 s6, 11
; GCN-NEXT: v_readlane_b32 s7, v6, 28
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 10
; GCN-NEXT: v_readlane_b32 s8, v6, 27
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmp_lg_u32 s6, 9
; GCN-NEXT: v_readlane_b32 s8, v6, 26
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 8
; GCN-NEXT: v_readlane_b32 s9, v6, 25
; GCN-NEXT: s_cselect_b32 s9, s9, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_and_b32 s7, s7, 15
; GCN-NEXT: s_lshl_b32 s7, s7, 8
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_cmp_lg_u32 s6, 7
; GCN-NEXT: v_readlane_b32 s7, v6, 24
; GCN-NEXT: s_cselect_b32 s7, s7, 1
; GCN-NEXT: s_lshl_b32 s7, s7, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 6
; GCN-NEXT: v_readlane_b32 s8, v6, 23
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_and_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 2
; GCN-NEXT: s_or_b32 s7, s7, s8
; GCN-NEXT: s_cmp_lg_u32 s6, 5
; GCN-NEXT: v_readlane_b32 s8, v6, 22
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 4
; GCN-NEXT: v_readlane_b32 s9, v6, 21
; GCN-NEXT: s_cselect_b32 s9, s9, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s8, s9, s8
; GCN-NEXT: s_and_b32 s8, s8, 3
; GCN-NEXT: s_or_b32 s7, s8, s7
; GCN-NEXT: s_lshl_b32 s7, s7, 4
; GCN-NEXT: s_cmp_lg_u32 s6, 3
; GCN-NEXT: v_readlane_b32 s8, v6, 20
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 3
; GCN-NEXT: s_cmp_lg_u32 s6, 2
; GCN-NEXT: v_readlane_b32 s9, v6, 19
; GCN-NEXT: s_cselect_b32 s9, s9, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_lshl_b32 s9, s9, 2
; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s0, s0, 1
; GCN-NEXT: s_and_b32 s0, s0, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 1
; GCN-NEXT: v_readlane_b32 s6, v6, 18
; GCN-NEXT: s_cselect_b32 s6, s6, 1
; GCN-NEXT: s_lshl_b32 s6, s6, 1
; GCN-NEXT: s_or_b32 s0, s0, s6
; GCN-NEXT: s_and_b32 s0, s0, 3
; GCN-NEXT: s_or_b32 s0, s0, s8
; GCN-NEXT: s_and_b32 s0, s0, 15
; GCN-NEXT: s_or_b32 s0, s0, s7
; GCN-NEXT: s_and_b32 s0, s0, 0xff
; GCN-NEXT: s_or_b32 s0, s0, s5
; GCN-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_readlane_b32 s0, v6, 0
; GCN-NEXT: v_readlane_b32 s1, v6, 1
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
; GCN-O0-LABEL: bit128_inselt:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s18, -1
; GCN-O0-NEXT: s_mov_b32 s19, 0xe80000
; GCN-O0-NEXT: s_add_u32 s16, s16, s11
; GCN-O0-NEXT: s_addc_u32 s17, s17, 0
; GCN-O0-NEXT: s_mov_b64 s[6:7], 52
; GCN-O0-NEXT: s_mov_b32 s0, s4
; GCN-O0-NEXT: s_mov_b32 s1, s5
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: s_mov_b32 s2, s7
; GCN-O0-NEXT: s_add_u32 s0, s0, s3
; GCN-O0-NEXT: s_addc_u32 s2, s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_mov_b32 s1, 1
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:388 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:648 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v2, v0, 2, 1
; GCN-O0-NEXT: v_bfe_u32 v3, v0, 3, 1
; GCN-O0-NEXT: v_bfe_u32 v4, v0, 4, 1
; GCN-O0-NEXT: v_bfe_u32 v5, v0, 5, 1
; GCN-O0-NEXT: v_bfe_u32 v6, v0, 6, 1
; GCN-O0-NEXT: s_mov_b32 s0, 7
; GCN-O0-NEXT: v_lshrrev_b32_e64 v7, s0, v0
; GCN-O0-NEXT: s_mov_b64 s[8:9], 53
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v8, v0, s1
; GCN-O0-NEXT: v_bfe_u32 v9, v0, 1, 1
; GCN-O0-NEXT: v_bfe_u32 v10, v0, 2, 1
; GCN-O0-NEXT: v_bfe_u32 v11, v0, 3, 1
; GCN-O0-NEXT: v_bfe_u32 v12, v0, 4, 1
; GCN-O0-NEXT: v_bfe_u32 v13, v0, 5, 1
; GCN-O0-NEXT: v_bfe_u32 v14, v0, 6, 1
; GCN-O0-NEXT: v_lshrrev_b32_e64 v15, s0, v0
; GCN-O0-NEXT: s_mov_b64 s[8:9], 54
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v16, v0, s1
; GCN-O0-NEXT: v_bfe_u32 v17, v0, 1, 1
; GCN-O0-NEXT: v_bfe_u32 v18, v0, 2, 1
; GCN-O0-NEXT: v_bfe_u32 v19, v0, 3, 1
; GCN-O0-NEXT: v_bfe_u32 v20, v0, 4, 1
; GCN-O0-NEXT: v_bfe_u32 v21, v0, 5, 1
; GCN-O0-NEXT: v_bfe_u32 v22, v0, 6, 1
; GCN-O0-NEXT: v_lshrrev_b32_e64 v23, s0, v0
; GCN-O0-NEXT: s_mov_b64 s[8:9], 55
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v24, v0, s1
; GCN-O0-NEXT: v_bfe_u32 v25, v0, 1, 1
; GCN-O0-NEXT: v_bfe_u32 v26, v0, 2, 1
; GCN-O0-NEXT: v_bfe_u32 v27, v0, 3, 1
; GCN-O0-NEXT: v_bfe_u32 v28, v0, 4, 1
; GCN-O0-NEXT: v_bfe_u32 v29, v0, 5, 1
; GCN-O0-NEXT: v_bfe_u32 v30, v0, 6, 1
; GCN-O0-NEXT: v_lshrrev_b32_e64 v31, s0, v0
; GCN-O0-NEXT: s_mov_b64 s[8:9], 56
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v32, v0, s1
; GCN-O0-NEXT: v_bfe_u32 v33, v0, 1, 1
; GCN-O0-NEXT: v_bfe_u32 v34, v0, 2, 1
; GCN-O0-NEXT: v_bfe_u32 v35, v0, 3, 1
; GCN-O0-NEXT: v_bfe_u32 v36, v0, 4, 1
; GCN-O0-NEXT: v_bfe_u32 v37, v0, 5, 1
; GCN-O0-NEXT: v_bfe_u32 v38, v0, 6, 1
; GCN-O0-NEXT: v_lshrrev_b32_e64 v39, s0, v0
; GCN-O0-NEXT: s_mov_b64 s[8:9], 57
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v40, v0, s1
; GCN-O0-NEXT: v_bfe_u32 v41, v0, 1, 1
; GCN-O0-NEXT: v_bfe_u32 v42, v0, 2, 1
; GCN-O0-NEXT: v_bfe_u32 v43, v0, 3, 1
; GCN-O0-NEXT: v_bfe_u32 v44, v0, 4, 1
; GCN-O0-NEXT: v_bfe_u32 v45, v0, 5, 1
; GCN-O0-NEXT: v_bfe_u32 v46, v0, 6, 1
; GCN-O0-NEXT: v_lshrrev_b32_e64 v47, s0, v0
; GCN-O0-NEXT: s_mov_b64 s[8:9], 58
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v48, v0, s1
; GCN-O0-NEXT: v_bfe_u32 v49, v0, 1, 1
; GCN-O0-NEXT: v_bfe_u32 v50, v0, 2, 1
; GCN-O0-NEXT: v_bfe_u32 v51, v0, 3, 1
; GCN-O0-NEXT: v_bfe_u32 v52, v0, 4, 1
; GCN-O0-NEXT: v_bfe_u32 v53, v0, 5, 1
; GCN-O0-NEXT: v_bfe_u32 v54, v0, 6, 1
; GCN-O0-NEXT: v_lshrrev_b32_e64 v55, s0, v0
; GCN-O0-NEXT: s_mov_b64 s[8:9], 59
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v56, v0, s1
; GCN-O0-NEXT: v_bfe_u32 v57, v0, 1, 1
; GCN-O0-NEXT: v_bfe_u32 v58, v0, 2, 1
; GCN-O0-NEXT: v_bfe_u32 v59, v0, 3, 1
; GCN-O0-NEXT: v_bfe_u32 v60, v0, 4, 1
; GCN-O0-NEXT: v_bfe_u32 v61, v0, 5, 1
; GCN-O0-NEXT: v_bfe_u32 v62, v0, 6, 1
; GCN-O0-NEXT: v_lshrrev_b32_e64 v63, s0, v0
; GCN-O0-NEXT: s_mov_b64 s[8:9], 60
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:392 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:396 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:400 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:404 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:408 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:412 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:416 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:420 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[8:9], 61
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:424 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:428 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:432 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:436 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:440 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:444 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:448 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:452 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[8:9], 62
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:456 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:460 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:464 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:468 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:472 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:476 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:480 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:484 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[8:9], 63
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:488 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:492 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:496 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:500 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:504 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:508 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:512 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:516 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[8:9], 64
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:520 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:524 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:528 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:532 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:536 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:540 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:544 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:548 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x41
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:552 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:556 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:560 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:564 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:568 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:572 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:576 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:580 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x42
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v0, s1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:584 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 1, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:588 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 2, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:592 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 3, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:596 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 4, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:600 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 5, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:604 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_bfe_u32 v1, v0, 6, 1
; GCN-O0-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:608 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:612 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[8:9], 0x43
; GCN-O0-NEXT: s_mov_b32 s2, s4
; GCN-O0-NEXT: s_mov_b32 s3, s5
; GCN-O0-NEXT: s_mov_b32 s7, s8
; GCN-O0-NEXT: s_mov_b32 s6, s9
; GCN-O0-NEXT: s_add_u32 s2, s2, s7
; GCN-O0-NEXT: s_addc_u32 s6, s3, s6
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b32 s3, s6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
; GCN-O0-NEXT: flat_load_ubyte v0, v[0:1]
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:648 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, s1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:616 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_bfe_u32 v0, v0, 1, 1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:620 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_bfe_u32 v0, v0, 2, 1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:624 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_bfe_u32 v0, v0, 3, 1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:628 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_bfe_u32 v0, v0, 4, 1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:632 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_bfe_u32 v0, v0, 5, 1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:636 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_bfe_u32 v0, v0, 6, 1
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:640 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:644 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_lshrrev_b32_e64 v0, s0, v0
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x44
; GCN-O0-NEXT: s_mov_b32 s3, 0x7f
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_and_b32 s3, s2, s3
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_add_i32 s2, s2, s3
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:127
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:640 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:126
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:636 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:125
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:632 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:124
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:628 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:123
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:624 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:122
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:620 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:121
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:616 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:120
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:612 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:119
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:608 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:118
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:604 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:117
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:600 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:116
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:596 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:115
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:592 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:114
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:588 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:113
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:584 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:112
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:580 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:111
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:576 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:110
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:572 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:109
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:568 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:108
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:564 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:107
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:560 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:106
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:556 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:105
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:552 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:104
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:548 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:103
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:544 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:102
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:540 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:101
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:536 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:100
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:532 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:99
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:528 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:98
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:524 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:97
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:520 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:96
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:516 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:95
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:512 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:94
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:508 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:93
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:504 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:92
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:500 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:91
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:496 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:90
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:492 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:89
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:488 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:88
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:484 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:87
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:480 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:86
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:476 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:85
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:472 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:84
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:468 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:83
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:464 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:82
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:460 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:81
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:456 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:80
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:452 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:79
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:448 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:78
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:444 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:77
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:440 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:76
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:436 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:75
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:432 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:74
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:428 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:73
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:424 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:72
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:420 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:71
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:416 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:70
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:412 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:69
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:408 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:68
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:404 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:67
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:400 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:66
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:396 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:65
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:392 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0 offset:64
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:388 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_store_byte v63, off, s[16:19], 0 offset:63
; GCN-O0-NEXT: buffer_store_byte v62, off, s[16:19], 0 offset:62
; GCN-O0-NEXT: buffer_store_byte v61, off, s[16:19], 0 offset:61
; GCN-O0-NEXT: buffer_store_byte v60, off, s[16:19], 0 offset:60
; GCN-O0-NEXT: buffer_store_byte v59, off, s[16:19], 0 offset:59
; GCN-O0-NEXT: buffer_store_byte v58, off, s[16:19], 0 offset:58
; GCN-O0-NEXT: buffer_store_byte v57, off, s[16:19], 0 offset:57
; GCN-O0-NEXT: buffer_store_byte v56, off, s[16:19], 0 offset:56
; GCN-O0-NEXT: buffer_store_byte v55, off, s[16:19], 0 offset:55
; GCN-O0-NEXT: buffer_store_byte v54, off, s[16:19], 0 offset:54
; GCN-O0-NEXT: buffer_store_byte v53, off, s[16:19], 0 offset:53
; GCN-O0-NEXT: buffer_store_byte v52, off, s[16:19], 0 offset:52
; GCN-O0-NEXT: buffer_store_byte v51, off, s[16:19], 0 offset:51
; GCN-O0-NEXT: buffer_store_byte v50, off, s[16:19], 0 offset:50
; GCN-O0-NEXT: buffer_store_byte v49, off, s[16:19], 0 offset:49
; GCN-O0-NEXT: buffer_store_byte v48, off, s[16:19], 0 offset:48
; GCN-O0-NEXT: buffer_store_byte v47, off, s[16:19], 0 offset:47
; GCN-O0-NEXT: buffer_store_byte v46, off, s[16:19], 0 offset:46
; GCN-O0-NEXT: buffer_store_byte v45, off, s[16:19], 0 offset:45
; GCN-O0-NEXT: buffer_store_byte v44, off, s[16:19], 0 offset:44
; GCN-O0-NEXT: buffer_store_byte v43, off, s[16:19], 0 offset:43
; GCN-O0-NEXT: buffer_store_byte v42, off, s[16:19], 0 offset:42
; GCN-O0-NEXT: buffer_store_byte v41, off, s[16:19], 0 offset:41
; GCN-O0-NEXT: buffer_store_byte v40, off, s[16:19], 0 offset:40
; GCN-O0-NEXT: buffer_store_byte v39, off, s[16:19], 0 offset:39
; GCN-O0-NEXT: buffer_store_byte v38, off, s[16:19], 0 offset:38
; GCN-O0-NEXT: buffer_store_byte v37, off, s[16:19], 0 offset:37
; GCN-O0-NEXT: buffer_store_byte v36, off, s[16:19], 0 offset:36
; GCN-O0-NEXT: buffer_store_byte v35, off, s[16:19], 0 offset:35
; GCN-O0-NEXT: buffer_store_byte v34, off, s[16:19], 0 offset:34
; GCN-O0-NEXT: buffer_store_byte v33, off, s[16:19], 0 offset:33
; GCN-O0-NEXT: buffer_store_byte v32, off, s[16:19], 0 offset:32
; GCN-O0-NEXT: buffer_store_byte v31, off, s[16:19], 0 offset:31
; GCN-O0-NEXT: buffer_store_byte v30, off, s[16:19], 0 offset:30
; GCN-O0-NEXT: buffer_store_byte v29, off, s[16:19], 0 offset:29
; GCN-O0-NEXT: buffer_store_byte v28, off, s[16:19], 0 offset:28
; GCN-O0-NEXT: buffer_store_byte v27, off, s[16:19], 0 offset:27
; GCN-O0-NEXT: buffer_store_byte v26, off, s[16:19], 0 offset:26
; GCN-O0-NEXT: buffer_store_byte v25, off, s[16:19], 0 offset:25
; GCN-O0-NEXT: buffer_store_byte v24, off, s[16:19], 0 offset:24
; GCN-O0-NEXT: buffer_store_byte v23, off, s[16:19], 0 offset:23
; GCN-O0-NEXT: buffer_store_byte v22, off, s[16:19], 0 offset:22
; GCN-O0-NEXT: buffer_store_byte v21, off, s[16:19], 0 offset:21
; GCN-O0-NEXT: buffer_store_byte v20, off, s[16:19], 0 offset:20
; GCN-O0-NEXT: buffer_store_byte v19, off, s[16:19], 0 offset:19
; GCN-O0-NEXT: buffer_store_byte v18, off, s[16:19], 0 offset:18
; GCN-O0-NEXT: buffer_store_byte v17, off, s[16:19], 0 offset:17
; GCN-O0-NEXT: buffer_store_byte v16, off, s[16:19], 0 offset:16
; GCN-O0-NEXT: buffer_store_byte v15, off, s[16:19], 0 offset:15
; GCN-O0-NEXT: buffer_store_byte v14, off, s[16:19], 0 offset:14
; GCN-O0-NEXT: buffer_store_byte v13, off, s[16:19], 0 offset:13
; GCN-O0-NEXT: buffer_store_byte v12, off, s[16:19], 0 offset:12
; GCN-O0-NEXT: buffer_store_byte v11, off, s[16:19], 0 offset:11
; GCN-O0-NEXT: buffer_store_byte v10, off, s[16:19], 0 offset:10
; GCN-O0-NEXT: buffer_store_byte v9, off, s[16:19], 0 offset:9
; GCN-O0-NEXT: buffer_store_byte v8, off, s[16:19], 0 offset:8
; GCN-O0-NEXT: buffer_store_byte v7, off, s[16:19], 0 offset:7
; GCN-O0-NEXT: buffer_store_byte v6, off, s[16:19], 0 offset:6
; GCN-O0-NEXT: buffer_store_byte v5, off, s[16:19], 0 offset:5
; GCN-O0-NEXT: buffer_store_byte v4, off, s[16:19], 0 offset:4
; GCN-O0-NEXT: buffer_store_byte v3, off, s[16:19], 0 offset:3
; GCN-O0-NEXT: buffer_store_byte v2, off, s[16:19], 0 offset:2
; GCN-O0-NEXT: buffer_store_byte v1, off, s[16:19], 0 offset:1
; GCN-O0-NEXT: s_waitcnt vmcnt(14)
; GCN-O0-NEXT: buffer_store_byte v0, off, s[16:19], 0
; GCN-O0-NEXT: v_mov_b32_e32 v3, 1
; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
; GCN-O0-NEXT: buffer_store_byte v3, v0, s[16:19], 0 offen
; GCN-O0-NEXT: buffer_load_ubyte v18, off, s[16:19], 0 offset:23
; GCN-O0-NEXT: buffer_load_ubyte v19, off, s[16:19], 0 offset:22
; GCN-O0-NEXT: buffer_load_ubyte v20, off, s[16:19], 0 offset:21
; GCN-O0-NEXT: buffer_load_ubyte v21, off, s[16:19], 0 offset:20
; GCN-O0-NEXT: buffer_load_ubyte v22, off, s[16:19], 0 offset:19
; GCN-O0-NEXT: buffer_load_ubyte v23, off, s[16:19], 0 offset:18
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v8, off, s[16:19], 0 offset:1
; GCN-O0-NEXT: buffer_load_ubyte v7, off, s[16:19], 0 offset:2
; GCN-O0-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:3
; GCN-O0-NEXT: buffer_load_ubyte v5, off, s[16:19], 0 offset:4
; GCN-O0-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:5
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:6
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:7
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v9, off, s[16:19], 0 offset:8
; GCN-O0-NEXT: buffer_load_ubyte v16, off, s[16:19], 0 offset:9
; GCN-O0-NEXT: buffer_load_ubyte v15, off, s[16:19], 0 offset:10
; GCN-O0-NEXT: buffer_load_ubyte v14, off, s[16:19], 0 offset:11
; GCN-O0-NEXT: buffer_load_ubyte v13, off, s[16:19], 0 offset:12
; GCN-O0-NEXT: buffer_load_ubyte v12, off, s[16:19], 0 offset:13
; GCN-O0-NEXT: buffer_load_ubyte v11, off, s[16:19], 0 offset:14
; GCN-O0-NEXT: buffer_load_ubyte v10, off, s[16:19], 0 offset:15
; GCN-O0-NEXT: buffer_load_ubyte v17, off, s[16:19], 0 offset:16
; GCN-O0-NEXT: buffer_load_ubyte v24, off, s[16:19], 0 offset:17
; GCN-O0-NEXT: buffer_load_ubyte v26, off, s[16:19], 0 offset:31
; GCN-O0-NEXT: buffer_load_ubyte v27, off, s[16:19], 0 offset:30
; GCN-O0-NEXT: buffer_load_ubyte v28, off, s[16:19], 0 offset:29
; GCN-O0-NEXT: buffer_load_ubyte v29, off, s[16:19], 0 offset:28
; GCN-O0-NEXT: buffer_load_ubyte v30, off, s[16:19], 0 offset:27
; GCN-O0-NEXT: buffer_load_ubyte v31, off, s[16:19], 0 offset:26
; GCN-O0-NEXT: buffer_load_ubyte v32, off, s[16:19], 0 offset:25
; GCN-O0-NEXT: buffer_load_ubyte v25, off, s[16:19], 0 offset:24
; GCN-O0-NEXT: buffer_load_ubyte v34, off, s[16:19], 0 offset:39
; GCN-O0-NEXT: buffer_load_ubyte v35, off, s[16:19], 0 offset:38
; GCN-O0-NEXT: buffer_load_ubyte v36, off, s[16:19], 0 offset:37
; GCN-O0-NEXT: buffer_load_ubyte v37, off, s[16:19], 0 offset:36
; GCN-O0-NEXT: buffer_load_ubyte v38, off, s[16:19], 0 offset:35
; GCN-O0-NEXT: buffer_load_ubyte v39, off, s[16:19], 0 offset:34
; GCN-O0-NEXT: buffer_load_ubyte v40, off, s[16:19], 0 offset:33
; GCN-O0-NEXT: buffer_load_ubyte v33, off, s[16:19], 0 offset:32
; GCN-O0-NEXT: buffer_load_ubyte v42, off, s[16:19], 0 offset:47
; GCN-O0-NEXT: buffer_load_ubyte v43, off, s[16:19], 0 offset:46
; GCN-O0-NEXT: buffer_load_ubyte v44, off, s[16:19], 0 offset:45
; GCN-O0-NEXT: buffer_load_ubyte v45, off, s[16:19], 0 offset:44
; GCN-O0-NEXT: buffer_load_ubyte v46, off, s[16:19], 0 offset:43
; GCN-O0-NEXT: buffer_load_ubyte v47, off, s[16:19], 0 offset:42
; GCN-O0-NEXT: buffer_load_ubyte v48, off, s[16:19], 0 offset:41
; GCN-O0-NEXT: buffer_load_ubyte v41, off, s[16:19], 0 offset:40
; GCN-O0-NEXT: buffer_load_ubyte v50, off, s[16:19], 0 offset:55
; GCN-O0-NEXT: buffer_load_ubyte v51, off, s[16:19], 0 offset:54
; GCN-O0-NEXT: buffer_load_ubyte v52, off, s[16:19], 0 offset:53
; GCN-O0-NEXT: buffer_load_ubyte v53, off, s[16:19], 0 offset:52
; GCN-O0-NEXT: buffer_load_ubyte v54, off, s[16:19], 0 offset:51
; GCN-O0-NEXT: buffer_load_ubyte v55, off, s[16:19], 0 offset:50
; GCN-O0-NEXT: buffer_load_ubyte v56, off, s[16:19], 0 offset:49
; GCN-O0-NEXT: buffer_load_ubyte v49, off, s[16:19], 0 offset:48
; GCN-O0-NEXT: buffer_load_ubyte v58, off, s[16:19], 0 offset:63
; GCN-O0-NEXT: buffer_load_ubyte v59, off, s[16:19], 0 offset:62
; GCN-O0-NEXT: buffer_load_ubyte v60, off, s[16:19], 0 offset:61
; GCN-O0-NEXT: buffer_load_ubyte v61, off, s[16:19], 0 offset:60
; GCN-O0-NEXT: buffer_load_ubyte v62, off, s[16:19], 0 offset:59
; GCN-O0-NEXT: buffer_load_ubyte v63, off, s[16:19], 0 offset:58
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:57
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v57, off, s[16:19], 0 offset:56
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:71
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:70
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:172 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:69
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:68
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:67
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:156 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:66
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:160 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:65
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:168 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:64
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:164 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:79
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:176 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:78
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:77
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:180 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:76
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:184 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:75
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:188 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:74
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:192 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:73
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:72
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:196 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:87
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:86
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:236 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:85
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:212 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:84
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:216 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:83
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:220 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:82
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:224 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:81
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:232 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:80
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:228 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:95
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:240 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:94
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:268 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:93
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:244 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:92
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:248 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:91
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:252 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:90
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:256 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:89
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:264 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:88
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:260 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:103
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:272 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:102
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:300 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:101
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:276 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:100
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:280 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:99
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:284 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:98
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:288 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:97
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:296 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:96
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:292 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:111
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:304 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:110
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:332 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:109
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:308 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:108
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:312 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:107
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:316 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:106
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:320 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:105
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:328 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:104
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:324 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:119
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:336 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:118
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:364 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:117
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:340 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:116
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:344 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:115
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:348 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:114
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:352 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:113
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:360 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:112
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:356 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:127
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:368 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:126
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:125
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:372 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:124
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:376 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:123
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:380 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:122
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:384 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:121
; GCN-O0-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 offset:120
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:384 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: s_mov_b32 s7, 2
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:380 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: s_mov_b32 s6, 3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:376 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: s_mov_b32 s5, 4
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:372 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: s_mov_b32 s4, 5
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:368 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3
; GCN-O0-NEXT: s_mov_b32 s3, 6
; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GCN-O0-NEXT: s_mov_b32 s2, 7
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1
; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1
; GCN-O0-NEXT: s_mov_b64 s[12:13], 15
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s8
; GCN-O0-NEXT: v_mov_b32_e32 v1, s9
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:364 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:360 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:356 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:352 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:348 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:344 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:340 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:336 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1
; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1
; GCN-O0-NEXT: s_mov_b64 s[12:13], 14
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s8
; GCN-O0-NEXT: v_mov_b32_e32 v1, s9
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:332 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:328 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:324 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:320 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:316 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:312 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:308 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:304 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1
; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1
; GCN-O0-NEXT: s_mov_b64 s[12:13], 13
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s8
; GCN-O0-NEXT: v_mov_b32_e32 v1, s9
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:300 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:296 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:292 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:288 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:284 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:280 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:276 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:272 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1
; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1
; GCN-O0-NEXT: s_mov_b64 s[12:13], 12
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s8
; GCN-O0-NEXT: v_mov_b32_e32 v1, s9
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:268 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:264 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:260 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:256 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:252 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:248 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:244 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:240 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1
; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1
; GCN-O0-NEXT: s_mov_b64 s[12:13], 11
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s8
; GCN-O0-NEXT: v_mov_b32_e32 v1, s9
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:236 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:232 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:228 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:224 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:220 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:216 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:212 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1
; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1
; GCN-O0-NEXT: s_mov_b64 s[12:13], 10
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s8
; GCN-O0-NEXT: v_mov_b32_e32 v1, s9
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1
; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1
; GCN-O0-NEXT: s_mov_b64 s[12:13], 9
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s8
; GCN-O0-NEXT: v_mov_b32_e32 v1, s9
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, v3, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s7, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s6, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s5, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v1, v1, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s4, v1
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v1
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1
; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1
; GCN-O0-NEXT: s_mov_b64 s[12:13], 8
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s8
; GCN-O0-NEXT: v_mov_b32_e32 v1, s9
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_and_b32_e64 v57, v57, v3
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v0, v3, v0
; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v0
; GCN-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:128 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_and_b32_e64 v63, v63, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v63, s7, v63
; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v63
; GCN-O0-NEXT: v_and_b32_e64 v62, v62, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v62, s6, v62
; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v62
; GCN-O0-NEXT: v_and_b32_e64 v61, v61, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v61, s5, v61
; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v61
; GCN-O0-NEXT: v_and_b32_e64 v60, v60, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v60, s4, v60
; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v60
; GCN-O0-NEXT: v_and_b32_e64 v59, v59, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v59, s3, v59
; GCN-O0-NEXT: v_or_b32_e64 v57, v57, v59
; GCN-O0-NEXT: v_lshlrev_b16_e64 v58, s2, v58
; GCN-O0-NEXT: v_or_b32_e64 v59, v57, v58
; GCN-O0-NEXT: s_mov_b64 s[12:13], 7
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v58, s9
; GCN-O0-NEXT: v_mov_b32_e32 v57, s8
; GCN-O0-NEXT: flat_store_byte v[57:58], v59
; GCN-O0-NEXT: v_and_b32_e64 v49, v49, v3
; GCN-O0-NEXT: v_and_b32_e64 v56, v56, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v56, v3, v56
; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v56
; GCN-O0-NEXT: v_and_b32_e64 v55, v55, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v55, s7, v55
; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v55
; GCN-O0-NEXT: v_and_b32_e64 v54, v54, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v54, s6, v54
; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v54
; GCN-O0-NEXT: v_and_b32_e64 v53, v53, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v53, s5, v53
; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v53
; GCN-O0-NEXT: v_and_b32_e64 v52, v52, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v52, s4, v52
; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v52
; GCN-O0-NEXT: v_and_b32_e64 v51, v51, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v51, s3, v51
; GCN-O0-NEXT: v_or_b32_e64 v49, v49, v51
; GCN-O0-NEXT: v_lshlrev_b16_e64 v50, s2, v50
; GCN-O0-NEXT: v_or_b32_e64 v51, v49, v50
; GCN-O0-NEXT: s_mov_b64 s[12:13], 6
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v50, s9
; GCN-O0-NEXT: v_mov_b32_e32 v49, s8
; GCN-O0-NEXT: flat_store_byte v[49:50], v51
; GCN-O0-NEXT: v_and_b32_e64 v41, v41, v3
; GCN-O0-NEXT: v_and_b32_e64 v48, v48, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v48, v3, v48
; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v48
; GCN-O0-NEXT: v_and_b32_e64 v47, v47, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v47, s7, v47
; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v47
; GCN-O0-NEXT: v_and_b32_e64 v46, v46, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v46, s6, v46
; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v46
; GCN-O0-NEXT: v_and_b32_e64 v45, v45, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v45, s5, v45
; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v45
; GCN-O0-NEXT: v_and_b32_e64 v44, v44, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v44, s4, v44
; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v44
; GCN-O0-NEXT: v_and_b32_e64 v43, v43, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v43, s3, v43
; GCN-O0-NEXT: v_or_b32_e64 v41, v41, v43
; GCN-O0-NEXT: v_lshlrev_b16_e64 v42, s2, v42
; GCN-O0-NEXT: v_or_b32_e64 v43, v41, v42
; GCN-O0-NEXT: s_mov_b64 s[12:13], 5
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v42, s9
; GCN-O0-NEXT: v_mov_b32_e32 v41, s8
; GCN-O0-NEXT: flat_store_byte v[41:42], v43
; GCN-O0-NEXT: v_and_b32_e64 v33, v33, v3
; GCN-O0-NEXT: v_and_b32_e64 v40, v40, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v40, v3, v40
; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v40
; GCN-O0-NEXT: v_and_b32_e64 v39, v39, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v39, s7, v39
; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v39
; GCN-O0-NEXT: v_and_b32_e64 v38, v38, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v38, s6, v38
; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v38
; GCN-O0-NEXT: v_and_b32_e64 v37, v37, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v37, s5, v37
; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v37
; GCN-O0-NEXT: v_and_b32_e64 v36, v36, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v36, s4, v36
; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v36
; GCN-O0-NEXT: v_and_b32_e64 v35, v35, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v35, s3, v35
; GCN-O0-NEXT: v_or_b32_e64 v33, v33, v35
; GCN-O0-NEXT: v_lshlrev_b16_e64 v34, s2, v34
; GCN-O0-NEXT: v_or_b32_e64 v35, v33, v34
; GCN-O0-NEXT: s_mov_b64 s[12:13], 4
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v34, s9
; GCN-O0-NEXT: v_mov_b32_e32 v33, s8
; GCN-O0-NEXT: flat_store_byte v[33:34], v35
; GCN-O0-NEXT: v_and_b32_e64 v25, v25, v3
; GCN-O0-NEXT: v_and_b32_e64 v32, v32, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v32, v3, v32
; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v32
; GCN-O0-NEXT: v_and_b32_e64 v31, v31, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v31, s7, v31
; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v31
; GCN-O0-NEXT: v_and_b32_e64 v30, v30, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v30, s6, v30
; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v30
; GCN-O0-NEXT: v_and_b32_e64 v29, v29, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v29, s5, v29
; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v29
; GCN-O0-NEXT: v_and_b32_e64 v28, v28, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v28, s4, v28
; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v28
; GCN-O0-NEXT: v_and_b32_e64 v27, v27, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v27, s3, v27
; GCN-O0-NEXT: v_or_b32_e64 v25, v25, v27
; GCN-O0-NEXT: v_lshlrev_b16_e64 v26, s2, v26
; GCN-O0-NEXT: v_or_b32_e64 v27, v25, v26
; GCN-O0-NEXT: s_mov_b64 s[12:13], 3
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v26, s9
; GCN-O0-NEXT: v_mov_b32_e32 v25, s8
; GCN-O0-NEXT: flat_store_byte v[25:26], v27
; GCN-O0-NEXT: v_and_b32_e64 v17, v17, v3
; GCN-O0-NEXT: v_and_b32_e64 v24, v24, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v24, v3, v24
; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v24
; GCN-O0-NEXT: v_and_b32_e64 v23, v23, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v23, s7, v23
; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v23
; GCN-O0-NEXT: v_and_b32_e64 v22, v22, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v22, s6, v22
; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v22
; GCN-O0-NEXT: v_and_b32_e64 v21, v21, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v21, s5, v21
; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v21
; GCN-O0-NEXT: v_and_b32_e64 v20, v20, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v20, s4, v20
; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v20
; GCN-O0-NEXT: v_and_b32_e64 v19, v19, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v19, s3, v19
; GCN-O0-NEXT: v_or_b32_e64 v17, v17, v19
; GCN-O0-NEXT: v_lshlrev_b16_e64 v18, s2, v18
; GCN-O0-NEXT: v_or_b32_e64 v19, v17, v18
; GCN-O0-NEXT: s_mov_b64 s[12:13], 2
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v18, s9
; GCN-O0-NEXT: v_mov_b32_e32 v17, s8
; GCN-O0-NEXT: flat_store_byte v[17:18], v19
; GCN-O0-NEXT: v_and_b32_e64 v9, v9, v3
; GCN-O0-NEXT: v_and_b32_e64 v16, v16, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v16, v3, v16
; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v16
; GCN-O0-NEXT: v_and_b32_e64 v15, v15, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v15, s7, v15
; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v15
; GCN-O0-NEXT: v_and_b32_e64 v14, v14, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v14, s6, v14
; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v14
; GCN-O0-NEXT: v_and_b32_e64 v13, v13, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v13, s5, v13
; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v13
; GCN-O0-NEXT: v_and_b32_e64 v12, v12, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v12, s4, v12
; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v12
; GCN-O0-NEXT: v_and_b32_e64 v11, v11, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v11, s3, v11
; GCN-O0-NEXT: v_or_b32_e64 v9, v9, v11
; GCN-O0-NEXT: v_lshlrev_b16_e64 v10, s2, v10
; GCN-O0-NEXT: v_or_b32_e64 v11, v9, v10
; GCN-O0-NEXT: s_mov_b64 s[12:13], 1
; GCN-O0-NEXT: s_mov_b32 s8, s0
; GCN-O0-NEXT: s_mov_b32 s9, s1
; GCN-O0-NEXT: s_mov_b32 s11, s12
; GCN-O0-NEXT: s_mov_b32 s10, s13
; GCN-O0-NEXT: s_add_u32 s8, s8, s11
; GCN-O0-NEXT: s_addc_u32 s10, s9, s10
; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s9, s10
; GCN-O0-NEXT: v_mov_b32_e32 v10, s9
; GCN-O0-NEXT: v_mov_b32_e32 v9, s8
; GCN-O0-NEXT: flat_store_byte v[9:10], v11
; GCN-O0-NEXT: s_waitcnt vmcnt(7)
; GCN-O0-NEXT: v_and_b32_e64 v0, v0, v3
; GCN-O0-NEXT: v_and_b32_e64 v8, v8, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v8, v3, v8
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v8
; GCN-O0-NEXT: v_and_b32_e64 v7, v7, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v7, s7, v7
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v7
; GCN-O0-NEXT: v_and_b32_e64 v6, v6, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v6, s6, v6
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v6
; GCN-O0-NEXT: v_and_b32_e64 v5, v5, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v5, s5, v5
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v5
; GCN-O0-NEXT: v_and_b32_e64 v4, v4, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v4, s4, v4
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v4
; GCN-O0-NEXT: v_and_b32_e64 v2, v2, v3
; GCN-O0-NEXT: v_lshlrev_b16_e64 v2, s3, v2
; GCN-O0-NEXT: v_or_b32_e64 v0, v0, v2
; GCN-O0-NEXT: v_lshlrev_b16_e64 v1, s2, v1
; GCN-O0-NEXT: v_or_b32_e64 v2, v0, v1
; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
; GCN-O0-NEXT: flat_store_byte v[0:1], v2
; GCN-O0-NEXT: s_endpgm
entry:
%v = insertelement <128 x i1> %vec, i1 1, i32 %sel
store <128 x i1> %v, ptr addrspace(1) %out
ret void
}
define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) {
; GCN-LABEL: float32_inselt_vec:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 3, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 4, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 5, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[8:9], 6, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[10:11], 7, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[12:13], 8, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[14:15], 9, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[16:17], 10, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[18:19], 11, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[20:21], 12, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[22:23], 13, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[24:25], 14, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[26:27], 15, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[28:29], 16, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[30:31], 17, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[34:35], 18, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[36:37], 19, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[38:39], 20, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[40:41], 21, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[42:43], 22, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[44:45], 23, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[46:47], 24, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[48:49], 25, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[50:51], 26, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[52:53], 27, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[54:55], 28, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[56:57], 29, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[58:59], 30, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[60:61], 31, v32
; GCN-NEXT: v_cmp_ne_u32_e64 s[62:63], 0, v32
; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v0, s[62:63]
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GCN-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v5, 1.0, v5, s[6:7]
; GCN-NEXT: v_cndmask_b32_e64 v6, 1.0, v6, s[8:9]
; GCN-NEXT: v_cndmask_b32_e64 v7, 1.0, v7, s[10:11]
; GCN-NEXT: v_cndmask_b32_e64 v8, 1.0, v8, s[12:13]
; GCN-NEXT: v_cndmask_b32_e64 v9, 1.0, v9, s[14:15]
; GCN-NEXT: v_cndmask_b32_e64 v10, 1.0, v10, s[16:17]
; GCN-NEXT: v_cndmask_b32_e64 v11, 1.0, v11, s[18:19]
; GCN-NEXT: v_cndmask_b32_e64 v12, 1.0, v12, s[20:21]
; GCN-NEXT: v_cndmask_b32_e64 v13, 1.0, v13, s[22:23]
; GCN-NEXT: v_cndmask_b32_e64 v14, 1.0, v14, s[24:25]
; GCN-NEXT: v_cndmask_b32_e64 v15, 1.0, v15, s[26:27]
; GCN-NEXT: v_cndmask_b32_e64 v16, 1.0, v16, s[28:29]
; GCN-NEXT: v_cndmask_b32_e64 v17, 1.0, v17, s[30:31]
; GCN-NEXT: v_cndmask_b32_e64 v18, 1.0, v18, s[34:35]
; GCN-NEXT: v_cndmask_b32_e64 v19, 1.0, v19, s[36:37]
; GCN-NEXT: v_cndmask_b32_e64 v20, 1.0, v20, s[38:39]
; GCN-NEXT: v_cndmask_b32_e64 v21, 1.0, v21, s[40:41]
; GCN-NEXT: v_cndmask_b32_e64 v22, 1.0, v22, s[42:43]
; GCN-NEXT: v_cndmask_b32_e64 v23, 1.0, v23, s[44:45]
; GCN-NEXT: v_cndmask_b32_e64 v24, 1.0, v24, s[46:47]
; GCN-NEXT: v_cndmask_b32_e64 v25, 1.0, v25, s[48:49]
; GCN-NEXT: v_cndmask_b32_e64 v26, 1.0, v26, s[50:51]
; GCN-NEXT: v_cndmask_b32_e64 v27, 1.0, v27, s[52:53]
; GCN-NEXT: v_cndmask_b32_e64 v28, 1.0, v28, s[54:55]
; GCN-NEXT: v_cndmask_b32_e64 v29, 1.0, v29, s[56:57]
; GCN-NEXT: v_cndmask_b32_e64 v30, 1.0, v30, s[58:59]
; GCN-NEXT: v_cndmask_b32_e64 v31, 1.0, v31, s[60:61]
; GCN-NEXT: ; return to shader part epilog
;
; GCN-O0-LABEL: float32_inselt_vec:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCN-O0-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s10, -1
; GCN-O0-NEXT: s_mov_b32 s11, 0xe80000
; GCN-O0-NEXT: s_add_u32 s8, s8, s0
; GCN-O0-NEXT: s_addc_u32 s9, s9, 0
; GCN-O0-NEXT: buffer_store_dword v32, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_mov_b32_e32 v32, v31
; GCN-O0-NEXT: v_mov_b32_e32 v33, v30
; GCN-O0-NEXT: v_mov_b32_e32 v34, v29
; GCN-O0-NEXT: v_mov_b32_e32 v35, v28
; GCN-O0-NEXT: v_mov_b32_e32 v36, v27
; GCN-O0-NEXT: v_mov_b32_e32 v37, v26
; GCN-O0-NEXT: v_mov_b32_e32 v38, v25
; GCN-O0-NEXT: v_mov_b32_e32 v39, v24
; GCN-O0-NEXT: v_mov_b32_e32 v40, v23
; GCN-O0-NEXT: v_mov_b32_e32 v41, v22
; GCN-O0-NEXT: v_mov_b32_e32 v42, v21
; GCN-O0-NEXT: v_mov_b32_e32 v43, v20
; GCN-O0-NEXT: v_mov_b32_e32 v44, v19
; GCN-O0-NEXT: v_mov_b32_e32 v45, v18
; GCN-O0-NEXT: v_mov_b32_e32 v46, v17
; GCN-O0-NEXT: v_mov_b32_e32 v47, v16
; GCN-O0-NEXT: v_mov_b32_e32 v48, v15
; GCN-O0-NEXT: v_mov_b32_e32 v49, v14
; GCN-O0-NEXT: v_mov_b32_e32 v50, v13
; GCN-O0-NEXT: v_mov_b32_e32 v51, v12
; GCN-O0-NEXT: v_mov_b32_e32 v52, v11
; GCN-O0-NEXT: v_mov_b32_e32 v53, v10
; GCN-O0-NEXT: v_mov_b32_e32 v54, v9
; GCN-O0-NEXT: v_mov_b32_e32 v55, v8
; GCN-O0-NEXT: v_mov_b32_e32 v56, v7
; GCN-O0-NEXT: v_mov_b32_e32 v57, v6
; GCN-O0-NEXT: v_mov_b32_e32 v58, v5
; GCN-O0-NEXT: v_mov_b32_e32 v59, v4
; GCN-O0-NEXT: v_mov_b32_e32 v60, v3
; GCN-O0-NEXT: v_mov_b32_e32 v61, v2
; GCN-O0-NEXT: v_mov_b32_e32 v62, v1
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v1, v62
; GCN-O0-NEXT: v_mov_b32_e32 v2, v61
; GCN-O0-NEXT: v_mov_b32_e32 v3, v60
; GCN-O0-NEXT: v_mov_b32_e32 v4, v59
; GCN-O0-NEXT: v_mov_b32_e32 v5, v58
; GCN-O0-NEXT: v_mov_b32_e32 v6, v57
; GCN-O0-NEXT: v_mov_b32_e32 v7, v56
; GCN-O0-NEXT: v_mov_b32_e32 v8, v55
; GCN-O0-NEXT: v_mov_b32_e32 v9, v54
; GCN-O0-NEXT: v_mov_b32_e32 v10, v53
; GCN-O0-NEXT: v_mov_b32_e32 v11, v52
; GCN-O0-NEXT: v_mov_b32_e32 v12, v51
; GCN-O0-NEXT: v_mov_b32_e32 v13, v50
; GCN-O0-NEXT: v_mov_b32_e32 v14, v49
; GCN-O0-NEXT: v_mov_b32_e32 v15, v48
; GCN-O0-NEXT: v_mov_b32_e32 v16, v47
; GCN-O0-NEXT: v_mov_b32_e32 v17, v46
; GCN-O0-NEXT: v_mov_b32_e32 v18, v45
; GCN-O0-NEXT: v_mov_b32_e32 v19, v44
; GCN-O0-NEXT: v_mov_b32_e32 v20, v43
; GCN-O0-NEXT: v_mov_b32_e32 v21, v42
; GCN-O0-NEXT: v_mov_b32_e32 v22, v41
; GCN-O0-NEXT: v_mov_b32_e32 v23, v40
; GCN-O0-NEXT: v_mov_b32_e32 v24, v39
; GCN-O0-NEXT: v_mov_b32_e32 v25, v38
; GCN-O0-NEXT: v_mov_b32_e32 v26, v37
; GCN-O0-NEXT: v_mov_b32_e32 v27, v36
; GCN-O0-NEXT: v_mov_b32_e32 v28, v35
; GCN-O0-NEXT: v_mov_b32_e32 v29, v34
; GCN-O0-NEXT: v_mov_b32_e32 v30, v33
; GCN-O0-NEXT: v_mov_b32_e32 v31, v32
; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
; GCN-O0-NEXT: v_mov_b32_e32 v32, 1.0
; GCN-O0-NEXT: buffer_store_dword v32, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
; GCN-O0-NEXT: ; implicit-def: $vgpr64 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v64, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v64, s1, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v64, off, s[8:11], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr0_sgpr1
; GCN-O0-NEXT: .LBB22_1: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v64, off, s[8:11], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v64, 2
; GCN-O0-NEXT: v_readlane_b32 s1, v64, 3
; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v32, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s2, v33
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v33
; GCN-O0-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GCN-O0-NEXT: s_mov_b32 m0, s2
; GCN-O0-NEXT: v_movreld_b32_e32 v0, v32
; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v21, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v22, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v23, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v24, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v29, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v30, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v31, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-O0-NEXT: v_writelane_b32 v64, s2, 2
; GCN-O0-NEXT: v_writelane_b32 v64, s3, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v64, off, s[8:11], 0 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execnz .LBB22_1
; GCN-O0-NEXT: ; %bb.2:
; GCN-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v64, off, s[8:11], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v64, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v64, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: ; %bb.3:
; GCN-O0-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v32, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v36, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v40, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v44, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v48, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v52, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v56, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v60, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(14)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v31
; GCN-O0-NEXT: v_mov_b32_e32 v1, v32
; GCN-O0-NEXT: v_mov_b32_e32 v2, v33
; GCN-O0-NEXT: v_mov_b32_e32 v3, v34
; GCN-O0-NEXT: v_mov_b32_e32 v4, v35
; GCN-O0-NEXT: v_mov_b32_e32 v5, v36
; GCN-O0-NEXT: v_mov_b32_e32 v6, v37
; GCN-O0-NEXT: v_mov_b32_e32 v7, v38
; GCN-O0-NEXT: v_mov_b32_e32 v8, v39
; GCN-O0-NEXT: v_mov_b32_e32 v9, v40
; GCN-O0-NEXT: v_mov_b32_e32 v10, v41
; GCN-O0-NEXT: v_mov_b32_e32 v11, v42
; GCN-O0-NEXT: v_mov_b32_e32 v12, v43
; GCN-O0-NEXT: v_mov_b32_e32 v13, v44
; GCN-O0-NEXT: v_mov_b32_e32 v14, v45
; GCN-O0-NEXT: v_mov_b32_e32 v15, v46
; GCN-O0-NEXT: v_mov_b32_e32 v16, v47
; GCN-O0-NEXT: v_mov_b32_e32 v17, v48
; GCN-O0-NEXT: s_waitcnt vmcnt(13)
; GCN-O0-NEXT: v_mov_b32_e32 v18, v49
; GCN-O0-NEXT: s_waitcnt vmcnt(12)
; GCN-O0-NEXT: v_mov_b32_e32 v19, v50
; GCN-O0-NEXT: s_waitcnt vmcnt(11)
; GCN-O0-NEXT: v_mov_b32_e32 v20, v51
; GCN-O0-NEXT: s_waitcnt vmcnt(10)
; GCN-O0-NEXT: v_mov_b32_e32 v21, v52
; GCN-O0-NEXT: s_waitcnt vmcnt(9)
; GCN-O0-NEXT: v_mov_b32_e32 v22, v53
; GCN-O0-NEXT: s_waitcnt vmcnt(8)
; GCN-O0-NEXT: v_mov_b32_e32 v23, v54
; GCN-O0-NEXT: s_waitcnt vmcnt(7)
; GCN-O0-NEXT: v_mov_b32_e32 v24, v55
; GCN-O0-NEXT: s_waitcnt vmcnt(6)
; GCN-O0-NEXT: v_mov_b32_e32 v25, v56
; GCN-O0-NEXT: s_waitcnt vmcnt(5)
; GCN-O0-NEXT: v_mov_b32_e32 v26, v57
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: v_mov_b32_e32 v27, v58
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: v_mov_b32_e32 v28, v59
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v29, v60
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v30, v61
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v31, v62
; GCN-O0-NEXT: ; return to shader part epilog
entry:
%v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
ret <32 x float> %v
}
; Test: insert the constant double 1.0 into an <8 x double> at a dynamic index
; (%sel) in a non-kernel function whose arguments are not marked inreg.
;
; Default run: the checks expect one v_cmp_eq_u32 of v16 against each element
; index 0..7, followed by two v_cndmask_b32 per element that select 0 into the
; low dword and 0x3ff00000 (held in v17) into the high dword.
;
; -O0 run: the checks expect the index to be shifted left by 1 and then two
; separate loops (.LBB23_1 and .LBB23_4). Each loop takes v_readfirstlane_b32
; of the index, masks exec with s_and_saveexec_b64, and writes through
; v_movreld_b32: the first loop relative to v0, the second relative to v1.
;
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) {
; GCN-LABEL: double8_inselt_vec:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; GCN-NEXT: v_mov_b32_e32 v17, 0x3ff00000
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
; GCN-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
; GCN-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
; GCN-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
; GCN-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
; GCN-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc
; GCN-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16
; GCN-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc
; GCN-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: double8_inselt_vec:
; GCN-O0: ; %bb.0: ; %entry
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v17, v15
; GCN-O0-NEXT: v_mov_b32_e32 v18, v14
; GCN-O0-NEXT: v_mov_b32_e32 v19, v13
; GCN-O0-NEXT: v_mov_b32_e32 v20, v12
; GCN-O0-NEXT: v_mov_b32_e32 v21, v11
; GCN-O0-NEXT: v_mov_b32_e32 v22, v10
; GCN-O0-NEXT: v_mov_b32_e32 v23, v9
; GCN-O0-NEXT: v_mov_b32_e32 v24, v8
; GCN-O0-NEXT: v_mov_b32_e32 v25, v7
; GCN-O0-NEXT: v_mov_b32_e32 v26, v6
; GCN-O0-NEXT: v_mov_b32_e32 v27, v5
; GCN-O0-NEXT: v_mov_b32_e32 v28, v4
; GCN-O0-NEXT: v_mov_b32_e32 v29, v3
; GCN-O0-NEXT: v_mov_b32_e32 v30, v2
; GCN-O0-NEXT: v_mov_b32_e32 v31, v1
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v1, v31
; GCN-O0-NEXT: v_mov_b32_e32 v2, v30
; GCN-O0-NEXT: v_mov_b32_e32 v3, v29
; GCN-O0-NEXT: v_mov_b32_e32 v4, v28
; GCN-O0-NEXT: v_mov_b32_e32 v5, v27
; GCN-O0-NEXT: v_mov_b32_e32 v6, v26
; GCN-O0-NEXT: v_mov_b32_e32 v7, v25
; GCN-O0-NEXT: v_mov_b32_e32 v8, v24
; GCN-O0-NEXT: v_mov_b32_e32 v9, v23
; GCN-O0-NEXT: v_mov_b32_e32 v10, v22
; GCN-O0-NEXT: v_mov_b32_e32 v11, v21
; GCN-O0-NEXT: v_mov_b32_e32 v12, v20
; GCN-O0-NEXT: v_mov_b32_e32 v13, v19
; GCN-O0-NEXT: v_mov_b32_e32 v14, v18
; GCN-O0-NEXT: v_mov_b32_e32 v15, v17
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s4, 1
; GCN-O0-NEXT: v_lshlrev_b32_e64 v16, s4, v16
; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], 1.0
; GCN-O0-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v33, s4, 0
; GCN-O0-NEXT: v_writelane_b32 v33, s5, 1
; GCN-O0-NEXT: v_mov_b32_e32 v16, s4
; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v33, s4, 2
; GCN-O0-NEXT: v_writelane_b32 v33, s5, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[10:11]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-O0-NEXT: .LBB23_1: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[10:11]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v33, 4
; GCN-O0-NEXT: v_readlane_b32 s5, v33, 5
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s6, v17
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v17
; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GCN-O0-NEXT: s_mov_b32 m0, s6
; GCN-O0-NEXT: v_movreld_b32_e32 v0, v16
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v33, s6, 4
; GCN-O0-NEXT: v_writelane_b32 v33, s7, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[10:11]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB23_1
; GCN-O0-NEXT: ; %bb.2:
; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[10:11]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v33, 2
; GCN-O0-NEXT: v_readlane_b32 s5, v33, 3
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; %bb.3:
; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[10:11]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v33, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v33, 1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s4, s5
; GCN-O0-NEXT: v_mov_b32_e32 v16, s4
; GCN-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: v_writelane_b32 v33, s4, 6
; GCN-O0-NEXT: v_writelane_b32 v33, s5, 7
; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[10:11]
; GCN-O0-NEXT: s_waitcnt vmcnt(14)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(14)
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(14)
; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(14)
; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-O0-NEXT: .LBB23_4: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[10:11]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v33, 8
; GCN-O0-NEXT: v_readlane_b32 s5, v33, 9
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s6, v17
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v17
; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GCN-O0-NEXT: s_mov_b32 m0, s6
; GCN-O0-NEXT: v_movreld_b32_e32 v1, v16
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v33, s6, 8
; GCN-O0-NEXT: v_writelane_b32 v33, s7, 9
; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
; GCN-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[10:11]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB23_4
; GCN-O0-NEXT: ; %bb.5:
; GCN-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[10:11]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v33, 6
; GCN-O0-NEXT: v_readlane_b32 s5, v33, 7
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; %bb.6:
; GCN-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(14)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v15
; GCN-O0-NEXT: v_mov_b32_e32 v1, v16
; GCN-O0-NEXT: s_waitcnt vmcnt(13)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v17
; GCN-O0-NEXT: s_waitcnt vmcnt(12)
; GCN-O0-NEXT: v_mov_b32_e32 v3, v18
; GCN-O0-NEXT: s_waitcnt vmcnt(11)
; GCN-O0-NEXT: v_mov_b32_e32 v4, v19
; GCN-O0-NEXT: s_waitcnt vmcnt(10)
; GCN-O0-NEXT: v_mov_b32_e32 v5, v20
; GCN-O0-NEXT: s_waitcnt vmcnt(9)
; GCN-O0-NEXT: v_mov_b32_e32 v6, v21
; GCN-O0-NEXT: s_waitcnt vmcnt(8)
; GCN-O0-NEXT: v_mov_b32_e32 v7, v22
; GCN-O0-NEXT: s_waitcnt vmcnt(7)
; GCN-O0-NEXT: v_mov_b32_e32 v8, v23
; GCN-O0-NEXT: s_waitcnt vmcnt(6)
; GCN-O0-NEXT: v_mov_b32_e32 v9, v24
; GCN-O0-NEXT: s_waitcnt vmcnt(5)
; GCN-O0-NEXT: v_mov_b32_e32 v10, v25
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: v_mov_b32_e32 v11, v26
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: v_mov_b32_e32 v12, v27
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v13, v28
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v14, v29
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v15, v30
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
entry:
; The block label "entry" is echoed in the checked "%bb.0" comment lines above,
; so renaming it requires regenerating the assertions.
%v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
ret <8 x double> %v
}
; Test: insert %val into a <3 x i32> at a dynamic index where the vector is
; passed inreg (read from s16..s18 in the checks) while %idx and %val are
; ordinary arguments (read from v0 and v1 in the checks).
;
; Default run: the checks expect one v_cmp_eq_u32 of v0 against each of 0, 1
; and 2, each followed by a v_cndmask_b32 between the copied SGPR element and
; v1.
;
; -O0 run: the checks expect the vector to be copied into v0..v2 and then a
; loop (.LBB24_1) that takes v_readfirstlane_b32 of the index, masks exec with
; s_and_saveexec_b64, and writes through v_movreld_b32.
;
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define <3 x i32> @insert_dyn_i32_3(<3 x i32> inreg %arg, i32 %idx, i32 %val) {
; GCN-LABEL: insert_dyn_i32_3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s17
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v4
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_i32_3:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v5, s4, 0
; GCN-O0-NEXT: v_writelane_b32 v5, s5, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-O0-NEXT: .LBB24_1: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v5, 2
; GCN-O0-NEXT: v_readlane_b32 s5, v5, 3
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s6, v4
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v4
; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GCN-O0-NEXT: s_mov_b32 m0, s6
; GCN-O0-NEXT: v_movreld_b32_e32 v0, v3
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v5, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v5, s7, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB24_1
; GCN-O0-NEXT: ; %bb.2:
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v5, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v5, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; %bb.3:
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v2
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v4
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
%x = insertelement <3 x i32> %arg, i32 %val, i32 %idx
ret <3 x i32> %x
}
; Test: insert %val into a <3 x i32> at a dynamic index where both the vector
; and %idx are passed inreg (read from s16..s18 and s19 in the checks) and
; %val is an ordinary argument (read from v0 in the checks).
;
; Default run: the checks expect one s_cmp_eq_u32 of s19 against each of 0, 1
; and 2, with s_cselect_b64 producing vcc for a v_cndmask_b32 per element.
;
; -O0 run: the checks expect the vector to be copied into v2..v4, s19 to be
; moved into m0, and a single v_movreld_b32 with no loop.
;
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define <3 x i32> @insert_dyn_inreg_i32_3(<3 x i32> inreg %arg, i32 inreg %idx, i32 %val) {
; GCN-LABEL: insert_dyn_inreg_i32_3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s19, 0
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s19, 1
; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s19, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_inreg_i32_3:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v4, s6
; GCN-O0-NEXT: s_mov_b32 m0, s19
; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v2
; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
; GCN-O0-NEXT: v_mov_b32_e32 v2, v4
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
%x = insertelement <3 x i32> %arg, i32 %val, i32 %idx
ret <3 x i32> %x
}
; Inserts %val into <3 x float> %arg at a runtime index; only the vector is
; inreg. Per the checks below, the vector arrives in s16-s18, the index in v0
; and the value in v1.
; Optimized output - one v_cmp_eq_u32 / v_cndmask_b32 pair per element.
; -O0 output - a loop that takes the index of the first active lane with
; v_readfirstlane_b32, limits exec to the lanes holding that same index
; (v_cmp_eq_u32 + s_and_saveexec_b64), writes through m0 + v_movreld_b32, then
; removes those lanes from exec and repeats while s_cbranch_execnz is taken.
; Values that live across the loop go through the Folded Spill / Folded Reload
; buffer accesses, and the original exec mask is kept in lanes 0-1 of v5.
define <3 x float> @insert_dyn_float_3(<3 x float> inreg %arg, i32 %idx, float %val) {
; GCN-LABEL: insert_dyn_float_3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s17
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v4
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_float_3:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v5, s4, 0
; GCN-O0-NEXT: v_writelane_b32 v5, s5, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-O0-NEXT: .LBB26_1: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v5, 2
; GCN-O0-NEXT: v_readlane_b32 s5, v5, 3
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s6, v4
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v4
; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GCN-O0-NEXT: s_mov_b32 m0, s6
; GCN-O0-NEXT: v_movreld_b32_e32 v0, v3
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v5, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v5, s7, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB26_1
; GCN-O0-NEXT: ; %bb.2:
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v5, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v5, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; %bb.3:
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v2
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v4
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
%x = insertelement <3 x float> %arg, float %val, i32 %idx
ret <3 x float> %x
}
; Inserts %val into <3 x float> %arg at a runtime index, with both the vector
; and the index marked inreg. Per the checks below, the vector arrives in
; s16-s18, the index in s19 and the value in v0.
; Optimized output - one s_cmp_eq_u32 / s_cselect_b64 / v_cndmask_b32 group
; per element.
; -O0 output - the vector is copied into VGPRs and the element is written
; through m0 + v_movreld_b32; the expected output contains no loop.
define <3 x float> @insert_dyn_inreg_float_3(<3 x float> inreg %arg, i32 inreg %idx, float %val) {
; GCN-LABEL: insert_dyn_inreg_float_3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s19, 0
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s19, 1
; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s19, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_inreg_float_3:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10 killed $sgpr4_sgpr5_sgpr6
; GCN-O0-NEXT: v_mov_b32_e32 v2, s4
; GCN-O0-NEXT: v_mov_b32_e32 v3, s5
; GCN-O0-NEXT: v_mov_b32_e32 v4, s6
; GCN-O0-NEXT: s_mov_b32 m0, s19
; GCN-O0-NEXT: v_movreld_b32_e32 v2, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v2
; GCN-O0-NEXT: v_mov_b32_e32 v1, v3
; GCN-O0-NEXT: v_mov_b32_e32 v2, v4
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
%x = insertelement <3 x float> %arg, float %val, i32 %idx
ret <3 x float> %x
}
; Inserts %val into <5 x i32> %arg at a runtime index; only the vector is
; inreg. Per the checks below, the vector arrives in s16-s20, the index in v0
; and the value in v1.
; Optimized output - one v_cmp_eq_u32 / v_cndmask_b32 pair per element.
; -O0 output - a loop that takes the index of the first active lane with
; v_readfirstlane_b32, limits exec to the lanes holding that same index
; (v_cmp_eq_u32 + s_and_saveexec_b64), writes through m0 + v_movreld_b32, then
; removes those lanes from exec and repeats while s_cbranch_execnz is taken.
; Values that live across the loop go through the Folded Spill / Folded Reload
; buffer accesses, and the original exec mask is kept in lanes 0-1 of v9.
define <5 x i32> @insert_dyn_i32_5(<5 x i32> inreg %arg, i32 %idx, i32 %val) {
; GCN-LABEL: insert_dyn_i32_5:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s17
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v6
; GCN-NEXT: v_mov_b32_e32 v1, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_i32_5:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v9, s4, 0
; GCN-O0-NEXT: v_writelane_b32 v9, s5, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-O0-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v9, 2
; GCN-O0-NEXT: v_readlane_b32 s5, v9, 3
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s6, v6
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v6
; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GCN-O0-NEXT: s_mov_b32 m0, s6
; GCN-O0-NEXT: v_movreld_b32_e32 v0, v5
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v9, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v9, s7, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB28_1
; GCN-O0-NEXT: ; %bb.2:
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v9, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v9, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; %bb.3:
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v4
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: v_mov_b32_e32 v1, v5
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v6
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v3, v7
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v4, v8
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
%x = insertelement <5 x i32> %arg, i32 %val, i32 %idx
ret <5 x i32> %x
}
; Inserts %val into <5 x i32> %arg at a runtime index, with both the vector
; and the index marked inreg. Per the checks below, the vector arrives in
; s16-s20, the index in s21 and the value in v0.
; Optimized output - one s_cmp_eq_u32 / s_cselect_b64 / v_cndmask_b32 group
; per element.
; -O0 output - the vector is copied into VGPRs and the element is written
; through m0 + v_movreld_b32; the expected output contains no loop.
define <5 x i32> @insert_dyn_inreg_i32_5(<5 x i32> inreg %arg, i32 inreg %idx, i32 %val) {
; GCN-LABEL: insert_dyn_inreg_i32_5:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s21, 0
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s21, 1
; GCN-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s21, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s21, 3
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s21, 4
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_inreg_i32_5:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
; GCN-O0-NEXT: v_mov_b32_e32 v4, s4
; GCN-O0-NEXT: v_mov_b32_e32 v5, s5
; GCN-O0-NEXT: v_mov_b32_e32 v6, s6
; GCN-O0-NEXT: v_mov_b32_e32 v7, s7
; GCN-O0-NEXT: v_mov_b32_e32 v8, s8
; GCN-O0-NEXT: s_mov_b32 m0, s21
; GCN-O0-NEXT: v_movreld_b32_e32 v4, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v4
; GCN-O0-NEXT: v_mov_b32_e32 v1, v5
; GCN-O0-NEXT: v_mov_b32_e32 v2, v6
; GCN-O0-NEXT: v_mov_b32_e32 v3, v7
; GCN-O0-NEXT: v_mov_b32_e32 v4, v8
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
%x = insertelement <5 x i32> %arg, i32 %val, i32 %idx
ret <5 x i32> %x
}
; Inserts %val into <5 x float> %arg at a runtime index; only the vector is
; inreg. Per the checks below, the vector arrives in s16-s20, the index in v0
; and the value in v1.
; Optimized output - one v_cmp_eq_u32 / v_cndmask_b32 pair per element.
; -O0 output - a loop that takes the index of the first active lane with
; v_readfirstlane_b32, limits exec to the lanes holding that same index
; (v_cmp_eq_u32 + s_and_saveexec_b64), writes through m0 + v_movreld_b32, then
; removes those lanes from exec and repeats while s_cbranch_execnz is taken.
; Values that live across the loop go through the Folded Spill / Folded Reload
; buffer accesses, and the original exec mask is kept in lanes 0-1 of v9.
define <5 x float> @insert_dyn_float_5(<5 x float> inreg %arg, i32 %idx, float %val) {
; GCN-LABEL: insert_dyn_float_5:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s17
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v6
; GCN-NEXT: v_mov_b32_e32 v1, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_float_5:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: ; implicit-def: $vgpr9 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v9, s4, 0
; GCN-O0-NEXT: v_writelane_b32 v9, s5, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-O0-NEXT: .LBB30_1: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v9, 2
; GCN-O0-NEXT: v_readlane_b32 s5, v9, 3
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s6, v6
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v6
; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GCN-O0-NEXT: s_mov_b32 m0, s6
; GCN-O0-NEXT: v_movreld_b32_e32 v0, v5
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v9, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v9, s7, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB30_1
; GCN-O0-NEXT: ; %bb.2:
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v9, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v9, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; %bb.3:
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v4
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: v_mov_b32_e32 v1, v5
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v6
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v3, v7
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v4, v8
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
%x = insertelement <5 x float> %arg, float %val, i32 %idx
ret <5 x float> %x
}
; Inserts %val into <5 x float> %arg at a runtime index, with both the vector
; and the index marked inreg. Per the checks below, the vector arrives in
; s16-s20, the index in s21 and the value in v0.
; Optimized output - one s_cmp_eq_u32 / s_cselect_b64 / v_cndmask_b32 group
; per element.
; -O0 output - the vector is copied into VGPRs and the element is written
; through m0 + v_movreld_b32; the expected output contains no loop.
define <5 x float> @insert_dyn_inreg_float_5(<5 x float> inreg %arg, i32 inreg %idx, float %val) {
; GCN-LABEL: insert_dyn_inreg_float_5:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s21, 0
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s21, 1
; GCN-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s21, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s21, 3
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s21, 4
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_inreg_float_5:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8
; GCN-O0-NEXT: v_mov_b32_e32 v4, s4
; GCN-O0-NEXT: v_mov_b32_e32 v5, s5
; GCN-O0-NEXT: v_mov_b32_e32 v6, s6
; GCN-O0-NEXT: v_mov_b32_e32 v7, s7
; GCN-O0-NEXT: v_mov_b32_e32 v8, s8
; GCN-O0-NEXT: s_mov_b32 m0, s21
; GCN-O0-NEXT: v_movreld_b32_e32 v4, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v4
; GCN-O0-NEXT: v_mov_b32_e32 v1, v5
; GCN-O0-NEXT: v_mov_b32_e32 v2, v6
; GCN-O0-NEXT: v_mov_b32_e32 v3, v7
; GCN-O0-NEXT: v_mov_b32_e32 v4, v8
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
%x = insertelement <5 x float> %arg, float %val, i32 %idx
ret <5 x float> %x
}
; Inserts %val into <6 x i32> %arg at a runtime index; only the vector is
; inreg. Per the checks below, the vector arrives in s16-s21, the index in v0
; and the value in v1.
; Optimized output - one v_cmp_eq_u32 / v_cndmask_b32 pair per element.
; -O0 output - a loop that takes the index of the first active lane with
; v_readfirstlane_b32, limits exec to the lanes holding that same index
; (v_cmp_eq_u32 + s_and_saveexec_b64), writes through m0 + v_movreld_b32, then
; removes those lanes from exec and repeats while s_cbranch_execnz is taken.
; Values that live across the loop go through the Folded Spill / Folded Reload
; buffer accesses, and the original exec mask is kept in lanes 0-1 of v11.
define <6 x i32> @insert_dyn_i32_6(<6 x i32> inreg %arg, i32 %idx, i32 %val) {
; GCN-LABEL: insert_dyn_i32_6:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s17
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s21
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v6
; GCN-NEXT: v_mov_b32_e32 v1, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_i32_6:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: s_mov_b32 s9, s21
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
; GCN-O0-NEXT: v_mov_b32_e32 v5, s9
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: ; implicit-def: $vgpr11 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v11, s4, 0
; GCN-O0-NEXT: v_writelane_b32 v11, s5, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-O0-NEXT: .LBB32_1: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v11, 2
; GCN-O0-NEXT: v_readlane_b32 s5, v11, 3
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s6, v7
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GCN-O0-NEXT: s_mov_b32 m0, s6
; GCN-O0-NEXT: v_movreld_b32_e32 v0, v6
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v11, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v11, s7, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB32_1
; GCN-O0-NEXT: ; %bb.2:
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v11, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v11, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; %bb.3:
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(5)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v5
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: v_mov_b32_e32 v1, v6
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v7
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v3, v8
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v4, v9
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v5, v10
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
%x = insertelement <6 x i32> %arg, i32 %val, i32 %idx
ret <6 x i32> %x
}
; Dynamic insertelement into a <6 x i32> vector where the vector and the index
; are both passed inreg and the inserted value is a plain argument.
; Optimized run: each of the six elements is expected to be chosen with an
; s_cmp_eq_u32 (index in s22 against 0..5) / s_cselect_b64 / v_cndmask_b32 chain.
; -O0 run: the index is expected to be copied into m0 and the element written
; with a single v_movreld_b32, with no loop.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define <6 x i32> @insert_dyn_inreg_i32_6(<6 x i32> inreg %arg, i32 inreg %idx, i32 %val) {
; GCN-LABEL: insert_dyn_inreg_i32_6:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s22, 0
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s22, 1
; GCN-NEXT: v_cndmask_b32_e32 v6, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s22, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s22, 3
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s22, 4
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s22, 5
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s21
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v6
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_inreg_i32_6:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: s_mov_b32 s9, s21
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
; GCN-O0-NEXT: v_mov_b32_e32 v10, s9
; GCN-O0-NEXT: v_mov_b32_e32 v9, s8
; GCN-O0-NEXT: v_mov_b32_e32 v8, s7
; GCN-O0-NEXT: v_mov_b32_e32 v7, s6
; GCN-O0-NEXT: v_mov_b32_e32 v6, s5
; GCN-O0-NEXT: v_mov_b32_e32 v5, s4
; GCN-O0-NEXT: s_mov_b32 m0, s22
; GCN-O0-NEXT: v_movreld_b32_e32 v5, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v5
; GCN-O0-NEXT: v_mov_b32_e32 v1, v6
; GCN-O0-NEXT: v_mov_b32_e32 v2, v7
; GCN-O0-NEXT: v_mov_b32_e32 v3, v8
; GCN-O0-NEXT: v_mov_b32_e32 v4, v9
; GCN-O0-NEXT: v_mov_b32_e32 v5, v10
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
; Operation under test: the insertion index is a function argument, not a constant.
%x = insertelement <6 x i32> %arg, i32 %val, i32 %idx
ret <6 x i32> %x
}
; Dynamic insertelement into a <6 x float> vector where only the vector is
; passed inreg; the index and the inserted value are plain arguments.
; Optimized run: each of the six elements is expected to be chosen with a
; v_cmp_eq_u32 (index in v0 against 0..5) / v_cndmask_b32 pair.
; -O0 run: the expected code spills to the stack and loops at .LBB34_1, using
; v_readfirstlane_b32 + v_cmp_eq_u32 + s_and_saveexec_b64 around a
; v_movreld_b32, and repeats via s_xor_b64 exec / s_cbranch_execnz.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define <6 x float> @insert_dyn_float_6(<6 x float> inreg %arg, i32 %idx, float %val) {
; GCN-LABEL: insert_dyn_float_6:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s17
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s21
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v6
; GCN-NEXT: v_mov_b32_e32 v1, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_float_6:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: s_mov_b32 s9, s21
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
; GCN-O0-NEXT: v_mov_b32_e32 v5, s9
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: ; implicit-def: $vgpr11 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v11, s4, 0
; GCN-O0-NEXT: v_writelane_b32 v11, s5, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-O0-NEXT: .LBB34_1: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v11, 2
; GCN-O0-NEXT: v_readlane_b32 s5, v11, 3
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s6, v7
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7
; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GCN-O0-NEXT: s_mov_b32 m0, s6
; GCN-O0-NEXT: v_movreld_b32_e32 v0, v6
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v11, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v11, s7, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB34_1
; GCN-O0-NEXT: ; %bb.2:
; GCN-O0-NEXT: s_or_saveexec_b64 s[22:23], -1
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[22:23]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v11, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v11, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; %bb.3:
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(5)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v5
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: v_mov_b32_e32 v1, v6
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v7
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v3, v8
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v4, v9
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v5, v10
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
; Operation under test: the insertion index is a function argument, not a constant.
%x = insertelement <6 x float> %arg, float %val, i32 %idx
ret <6 x float> %x
}
; Dynamic insertelement into a <6 x float> vector where the vector and the
; index are both passed inreg and the inserted value is a plain argument.
; Optimized run: each of the six elements is expected to be chosen with an
; s_cmp_eq_u32 (index in s22 against 0..5) / s_cselect_b64 / v_cndmask_b32 chain.
; -O0 run: the index is expected to be copied into m0 and the element written
; with a single v_movreld_b32, with no loop.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define <6 x float> @insert_dyn_inreg_float_6(<6 x float> inreg %arg, i32 inreg %idx, float %val) {
; GCN-LABEL: insert_dyn_inreg_float_6:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s22, 0
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s22, 1
; GCN-NEXT: v_cndmask_b32_e32 v6, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s22, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s22, 3
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s22, 4
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s22, 5
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s21
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v6
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_inreg_float_6:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: s_mov_b32 s9, s21
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
; GCN-O0-NEXT: v_mov_b32_e32 v10, s9
; GCN-O0-NEXT: v_mov_b32_e32 v9, s8
; GCN-O0-NEXT: v_mov_b32_e32 v8, s7
; GCN-O0-NEXT: v_mov_b32_e32 v7, s6
; GCN-O0-NEXT: v_mov_b32_e32 v6, s5
; GCN-O0-NEXT: v_mov_b32_e32 v5, s4
; GCN-O0-NEXT: s_mov_b32 m0, s22
; GCN-O0-NEXT: v_movreld_b32_e32 v5, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v5
; GCN-O0-NEXT: v_mov_b32_e32 v1, v6
; GCN-O0-NEXT: v_mov_b32_e32 v2, v7
; GCN-O0-NEXT: v_mov_b32_e32 v3, v8
; GCN-O0-NEXT: v_mov_b32_e32 v4, v9
; GCN-O0-NEXT: v_mov_b32_e32 v5, v10
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
; Operation under test: the insertion index is a function argument, not a constant.
%x = insertelement <6 x float> %arg, float %val, i32 %idx
ret <6 x float> %x
}
; Dynamic insertelement into a <7 x i32> vector where only the vector is
; passed inreg; the index and the inserted value are plain arguments.
; Optimized run: each of the seven elements is expected to be chosen with a
; v_cmp_eq_u32 (index in v0 against 0..6) / v_cndmask_b32 pair.
; -O0 run: the expected code spills to the stack and loops at .LBB36_1, using
; v_readfirstlane_b32 + v_cmp_eq_u32 + s_and_saveexec_b64 around a
; v_movreld_b32, and repeats via s_xor_b64 exec / s_cbranch_execnz.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define <7 x i32> @insert_dyn_i32_7(<7 x i32> inreg %arg, i32 %idx, i32 %val) {
; GCN-LABEL: insert_dyn_i32_7:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v8, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s17
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s21
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v6, s22
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v8
; GCN-NEXT: v_mov_b32_e32 v1, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_i32_7:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: s_mov_b32 s9, s21
; GCN-O0-NEXT: s_mov_b32 s10, s22
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
; GCN-O0-NEXT: v_mov_b32_e32 v5, s9
; GCN-O0-NEXT: v_mov_b32_e32 v6, s10
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v13, s4, 0
; GCN-O0-NEXT: v_writelane_b32 v13, s5, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[26:27]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-O0-NEXT: .LBB36_1: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1
; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[26:27]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v13, 2
; GCN-O0-NEXT: v_readlane_b32 s5, v13, 3
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s6, v8
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v8
; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GCN-O0-NEXT: s_mov_b32 m0, s6
; GCN-O0-NEXT: v_movreld_b32_e32 v0, v7
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v13, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v13, s7, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[26:27]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB36_1
; GCN-O0-NEXT: ; %bb.2:
; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1
; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[26:27]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v13, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v13, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; %bb.3:
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(6)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v6
; GCN-O0-NEXT: s_waitcnt vmcnt(5)
; GCN-O0-NEXT: v_mov_b32_e32 v1, v7
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v8
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: v_mov_b32_e32 v3, v9
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v4, v10
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v5, v11
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v6, v12
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
; Operation under test: the insertion index is a function argument, not a constant.
%x = insertelement <7 x i32> %arg, i32 %val, i32 %idx
ret <7 x i32> %x
}
; Dynamic insertelement into a <7 x i32> vector where the vector and the index
; are both passed inreg and the inserted value is a plain argument.
; Optimized run: each of the seven elements is expected to be chosen with an
; s_cmp_eq_u32 (index in s23 against 0..6) / s_cselect_b64 / v_cndmask_b32 chain.
; -O0 run: the index is expected to be copied into m0 and the element written
; with a single v_movreld_b32, with no loop.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define <7 x i32> @insert_dyn_inreg_i32_7(<7 x i32> inreg %arg, i32 inreg %idx, i32 %val) {
; GCN-LABEL: insert_dyn_inreg_i32_7:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s23, 0
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 1
; GCN-NEXT: v_cndmask_b32_e32 v7, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 3
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 4
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 5
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s21
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 6
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v6, s22
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_inreg_i32_7:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: s_mov_b32 s9, s21
; GCN-O0-NEXT: s_mov_b32 s10, s22
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
; GCN-O0-NEXT: v_mov_b32_e32 v12, s10
; GCN-O0-NEXT: v_mov_b32_e32 v11, s9
; GCN-O0-NEXT: v_mov_b32_e32 v10, s8
; GCN-O0-NEXT: v_mov_b32_e32 v9, s7
; GCN-O0-NEXT: v_mov_b32_e32 v8, s6
; GCN-O0-NEXT: v_mov_b32_e32 v7, s5
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: s_mov_b32 m0, s23
; GCN-O0-NEXT: v_movreld_b32_e32 v6, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v6
; GCN-O0-NEXT: v_mov_b32_e32 v1, v7
; GCN-O0-NEXT: v_mov_b32_e32 v2, v8
; GCN-O0-NEXT: v_mov_b32_e32 v3, v9
; GCN-O0-NEXT: v_mov_b32_e32 v4, v10
; GCN-O0-NEXT: v_mov_b32_e32 v5, v11
; GCN-O0-NEXT: v_mov_b32_e32 v6, v12
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
; Operation under test: the insertion index is a function argument, not a constant.
%x = insertelement <7 x i32> %arg, i32 %val, i32 %idx
ret <7 x i32> %x
}
; Dynamic insertelement into a <7 x float> vector where only the vector is
; passed inreg; the index and the inserted value are plain arguments.
; Optimized run: each of the seven elements is expected to be chosen with a
; v_cmp_eq_u32 (index in v0 against 0..6) / v_cndmask_b32 pair.
; -O0 run: the expected code spills to the stack and loops at .LBB38_1, using
; v_readfirstlane_b32 + v_cmp_eq_u32 + s_and_saveexec_b64 around a
; v_movreld_b32, and repeats via s_xor_b64 exec / s_cbranch_execnz.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define <7 x float> @insert_dyn_float_7(<7 x float> inreg %arg, i32 %idx, float %val) {
; GCN-LABEL: insert_dyn_float_7:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v8, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s17
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s21
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v6, s22
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v8
; GCN-NEXT: v_mov_b32_e32 v1, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_float_7:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: s_mov_b32 s9, s21
; GCN-O0-NEXT: s_mov_b32 s10, s22
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: v_mov_b32_e32 v4, s8
; GCN-O0-NEXT: v_mov_b32_e32 v5, s9
; GCN-O0-NEXT: v_mov_b32_e32 v6, s10
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
; GCN-O0-NEXT: ; implicit-def: $vgpr13 : SGPR spill to VGPR lane
; GCN-O0-NEXT: v_writelane_b32 v13, s4, 0
; GCN-O0-NEXT: v_writelane_b32 v13, s5, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[26:27]
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-O0-NEXT: .LBB38_1: ; =>This Inner Loop Header: Depth=1
; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1
; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[26:27]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v13, 2
; GCN-O0-NEXT: v_readlane_b32 s5, v13, 3
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readfirstlane_b32 s6, v8
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v8
; GCN-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GCN-O0-NEXT: s_mov_b32 m0, s6
; GCN-O0-NEXT: v_movreld_b32_e32 v0, v7
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-O0-NEXT: v_writelane_b32 v13, s6, 2
; GCN-O0-NEXT: v_writelane_b32 v13, s7, 3
; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1
; GCN-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[26:27]
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB38_1
; GCN-O0-NEXT: ; %bb.2:
; GCN-O0-NEXT: s_or_saveexec_b64 s[26:27], -1
; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[26:27]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v13, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v13, 1
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: ; %bb.3:
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(6)
; GCN-O0-NEXT: v_mov_b32_e32 v0, v6
; GCN-O0-NEXT: s_waitcnt vmcnt(5)
; GCN-O0-NEXT: v_mov_b32_e32 v1, v7
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: v_mov_b32_e32 v2, v8
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: v_mov_b32_e32 v3, v9
; GCN-O0-NEXT: s_waitcnt vmcnt(2)
; GCN-O0-NEXT: v_mov_b32_e32 v4, v10
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_mov_b32_e32 v5, v11
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v6, v12
; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
; Operation under test: the insertion index is a function argument, not a constant.
%x = insertelement <7 x float> %arg, float %val, i32 %idx
ret <7 x float> %x
}
; Dynamic insertelement into a <7 x float> vector where the vector and the
; index are both passed inreg and the inserted value is a plain argument.
; Optimized run: each of the seven elements is expected to be chosen with an
; s_cmp_eq_u32 (index in s23 against 0..6) / s_cselect_b64 / v_cndmask_b32 chain.
; -O0 run: the index is expected to be copied into m0 and the element written
; with a single v_movreld_b32, with no loop.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define <7 x float> @insert_dyn_inreg_float_7(<7 x float> inreg %arg, i32 inreg %idx, float %val) {
; GCN-LABEL: insert_dyn_inreg_float_7:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s23, 0
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 1
; GCN-NEXT: v_cndmask_b32_e32 v7, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 3
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s19
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 4
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 5
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s21
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s23, 6
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v6, s22
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-O0-LABEL: insert_dyn_inreg_float_7:
; GCN-O0: ; %bb.0:
; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b32 s4, s16
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
; GCN-O0-NEXT: s_mov_b32 s5, s17
; GCN-O0-NEXT: s_mov_b32 s6, s18
; GCN-O0-NEXT: s_mov_b32 s7, s19
; GCN-O0-NEXT: s_mov_b32 s8, s20
; GCN-O0-NEXT: s_mov_b32 s9, s21
; GCN-O0-NEXT: s_mov_b32 s10, s22
; GCN-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18 killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10
; GCN-O0-NEXT: v_mov_b32_e32 v12, s10
; GCN-O0-NEXT: v_mov_b32_e32 v11, s9
; GCN-O0-NEXT: v_mov_b32_e32 v10, s8
; GCN-O0-NEXT: v_mov_b32_e32 v9, s7
; GCN-O0-NEXT: v_mov_b32_e32 v8, s6
; GCN-O0-NEXT: v_mov_b32_e32 v7, s5
; GCN-O0-NEXT: v_mov_b32_e32 v6, s4
; GCN-O0-NEXT: s_mov_b32 m0, s23
; GCN-O0-NEXT: v_movreld_b32_e32 v6, v0
; GCN-O0-NEXT: v_mov_b32_e32 v0, v6
; GCN-O0-NEXT: v_mov_b32_e32 v1, v7
; GCN-O0-NEXT: v_mov_b32_e32 v2, v8
; GCN-O0-NEXT: v_mov_b32_e32 v3, v9
; GCN-O0-NEXT: v_mov_b32_e32 v4, v10
; GCN-O0-NEXT: v_mov_b32_e32 v5, v11
; GCN-O0-NEXT: v_mov_b32_e32 v6, v12
; GCN-O0-NEXT: s_setpc_b64 s[30:31]
; Operation under test: the insertion index is a function argument, not a constant.
%x = insertelement <7 x float> %arg, float %val, i32 %idx
ret <7 x float> %x
}