blob: c7c9e90e1967743902d2914b433a3de6855cb104 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=GFX11 %s
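; This test checks that the compiler doesn't crash on these bitcasts; the autogenerated FileCheck assertions additionally pin the code generated for each RUN line.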
; Loads a <32 x i8> through the inreg addrspace(4) pointer argument, bitcasts it
; to <8 x i32>, and inspects only element 1: the shader returns 0.0 when that
; element is non-zero and 1.0 when it is zero.
; The expected code for every RUN line is a single dword scalar load followed by
; a compare/select, i.e. only the one dword that is actually used gets loaded.
; NOTE(review): attribute group #0 is not defined in the visible part of this file.
define amdgpu_ps float @v32i8_to_v8i32(ptr addrspace(4) inreg) #0 {
; GCN-LABEL: v32i8_to_v8i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[0:1], 0x1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
; GCN-NEXT: ; return to shader part epilog
;
; VI-LABEL: v32i8_to_v8i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[0:1], 0x4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v32i8_to_v8i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v32i8_to_v8i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s0
; GFX11-NEXT: ; return to shader part epilog
entry:
%1 = load <32 x i8>, ptr addrspace(4) %0
%2 = bitcast <32 x i8> %1 to <8 x i32>
%3 = extractelement <8 x i32> %2, i32 1
%4 = icmp ne i32 %3, 0
%5 = select i1 %4, float 0.0, float 1.0
ret float %5
}
; Copies one <16 x i8> value from %in to %out (both addrspace(1) pointers).
; There is no explicit bitcast instruction in the body; the vector type is
; applied directly by the load and store.
; NOTE(review): the name suggests this once exercised an i8* -> <16 x i8>*
; pointer cast; that cannot be confirmed from this file.
; The expected code for every RUN line is one 128-bit load and one 128-bit store.
define amdgpu_kernel void @i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: i8ptr_v16i8ptr:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: i8ptr_v16i8ptr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i8ptr_v16i8ptr:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: i8ptr_v16i8ptr:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
%0 = load <16 x i8>, ptr addrspace(1) %in
store <16 x i8> %0, ptr addrspace(1) %out
ret void
}
; Bitcast float -> <2 x i16> with arithmetic on both sides of the cast:
; load float, fadd 1.0, bitcast to <2 x i16>, add <2, 2>, store to %out.
; The float add before and the integer add after keep the bitcast from being
; folded straight into the load or the store.
; Expected code: the default-target and tonga expectations add 2 to each 16-bit
; half separately with masks (the high half as an add of 0x20000); the gfx900
; and gfx1100 expectations use a single v_pk_add_u16.
define amdgpu_kernel void @f32_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: f32_to_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: f32_to_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f32_e64 v2, s2, 1.0
; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v2
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x20000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: f32_to_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, s2, 1.0
; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: f32_to_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %in, align 4
%fadd32 = fadd float %load, 1.0
%bc = bitcast float %fadd32 to <2 x i16>
%add.bitcast = add <2 x i16> %bc, <i16 2, i16 2>
store <2 x i16> %add.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast <2 x i16> -> float, the reverse direction of @f32_to_v2i16:
; load <2 x i16>, add <2, 2>, bitcast to float, fadd 1.0, store to %out.
; Expected code: the default-target and tonga expectations perform the vector
; add with scalar mask/add/or instructions before the float add; the gfx900 and
; gfx1100 expectations use v_pk_add_u16 followed by the float add.
define amdgpu_kernel void @v2i16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v2i16_to_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s5, s4, 0xffff0000
; GCN-NEXT: s_add_i32 s4, s4, 2
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_or_b32 s4, s5, s4
; GCN-NEXT: s_add_i32 s4, s4, 0x20000
; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v2i16_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s2, 0xffff0000
; VI-NEXT: s_add_i32 s2, s2, 2
; VI-NEXT: s_and_b32 s1, s2, 0xffff
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_add_i32 s0, s0, 0x20000
; VI-NEXT: v_add_f32_e64 v2, s0, 1.0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v2i16_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, s2, 2 op_sel_hi:[1,0]
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v2i16_to_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, s2, 2 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 1.0, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <2 x i16>, ptr addrspace(1) %in, align 4
%add.v2i16 = add <2 x i16> %load, <i16 2, i16 2>
%bc = bitcast <2 x i16> %add.v2i16 to float
%fadd.bitcast = fadd float %bc, 1.0
store float %fadd.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast float -> <2 x half> with floating-point arithmetic on both sides:
; load float, fadd 1.0, bitcast to <2 x half>, fadd <2.0, 2.0>, store to %out.
; Expected code: the default-target expectation converts each half to f32, adds
; 2.0, and converts back; the tonga expectation uses 16-bit adds (one via SDWA
; on the high word); the gfx900 and gfx1100 expectations use v_pk_add_f16.
define amdgpu_kernel void @f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: f32_to_v2f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0
; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: f32_to_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f32_e64 v3, s2, 1.0
; VI-NEXT: v_add_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v3, 2.0, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: f32_to_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, s2, 1.0
; GFX9-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: f32_to_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %in, align 4
%fadd32 = fadd float %load, 1.0
%bc = bitcast float %fadd32 to <2 x half>
%add.bitcast = fadd <2 x half> %bc, <half 2.0, half 2.0>
store <2 x half> %add.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast <2 x half> -> float, the reverse direction of @f32_to_v2f16:
; load <2 x half>, fadd <2.0, 2.0>, bitcast to float, fadd 1.0, store to %out.
; Expected code: the default-target expectation widens each half to f32 for the
; add and repacks the result before the final f32 add; the tonga expectation
; uses 16-bit adds; the gfx900 and gfx1100 expectations use v_pk_add_f16.
define amdgpu_kernel void @v2f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v2f16_to_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4
; GCN-NEXT: s_lshr_b32 s4, s4, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4
; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0
; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v2f16_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_add_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_add_f16_e64 v3, s2, 2.0
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v2f16_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v2f16_to_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, s2, 2.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 1.0, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <2 x half>, ptr addrspace(1) %in, align 4
%add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0>
%bc = bitcast <2 x half> %add.v2f16 to float
%fadd.bitcast = fadd float %bc, 1.0
store float %fadd.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast <4 x i8> -> i32 with no arithmetic: load <4 x i8> from %in, bitcast
; to i32, store to %out.
; The expected code for every RUN line is a single dword load followed by a
; dword store, so the bitcast itself produces no instructions.
define amdgpu_kernel void @v4i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v4i8_to_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v4i8_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4i8_to_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v4i8_to_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <4 x i8>, ptr addrspace(1) %in, align 4
%bc = bitcast <4 x i8> %load to i32
store i32 %bc, ptr addrspace(1) %out, align 4
ret void
}
; Bitcast i32 -> <4 x i8> with no arithmetic, the reverse direction of
; @v4i8_to_i32: load i32 from %in, bitcast to <4 x i8>, store to %out.
; The expected code for every RUN line is a single dword load followed by a
; dword store; the vector store is not split into byte stores.
define amdgpu_kernel void @i32_to_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: i32_to_v4i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: i32_to_v4i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: i32_to_v4i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: i32_to_v4i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%bc = bitcast i32 %load to <4 x i8>
store <4 x i8> %bc, ptr addrspace(1) %out, align 4
ret void
}
; Bitcast <2 x i32> -> double with arithmetic on both sides of the cast:
; load <2 x i32>, add <4, 9>, bitcast to double, fadd 1.0, store to %out.
; The distinct per-lane constants (4 and 9) make it visible in the expected code
; which 32-bit register of the pair receives which lane: the low register gets
; +4 and the high register gets +9 before the 64-bit v_add_f64.
define amdgpu_kernel void @bitcast_v2i32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: bitcast_v2i32_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_i32 s5, s5, 9
; GCN-NEXT: s_add_i32 s4, s4, 4
; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v2i32_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_i32 s3, s3, 9
; VI-NEXT: s_add_i32 s2, s2, 4
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v2i32_to_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s3, s3, 9
; GFX9-NEXT: s_add_i32 s2, s2, 4
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v2i32_to_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s3, s3, 9
; GFX11-NEXT: s_add_i32 s2, s2, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%val = load <2 x i32>, ptr addrspace(1) %in, align 8
%add = add <2 x i32> %val, <i32 4, i32 9>
%bc = bitcast <2 x i32> %add to double
%fadd.bc = fadd double %bc, 1.0
store double %fadd.bc, ptr addrspace(1) %out, align 8
ret void
}
; Bitcast double -> <2 x i32>, the reverse direction of @bitcast_v2i32_to_f64:
; load double, fadd 4.0, bitcast to <2 x i32>, store to %out.
; There is no integer arithmetic after the cast, so the expected code for every
; RUN line is just the 64-bit load, one v_add_f64, and a 64-bit store.
define amdgpu_kernel void @bitcast_f64_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: bitcast_f64_to_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 4.0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_f64_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_f64_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], 4.0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_f64_to_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 4.0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%val = load double, ptr addrspace(1) %in, align 8
%add = fadd double %val, 4.0
%bc = bitcast double %add to <2 x i32>
store <2 x i32> %bc, ptr addrspace(1) %out, align 8
ret void
}
; Bitcast <2 x i64> -> <2 x double> whose result is only used by a phi in a
; different basic block.
; When %cond == 0 control goes through %if, where the kernel argument %value is
; bitcast; otherwise %entry branches straight to %end. The phi in %end selects
; the cast value or zeroinitializer accordingly and the result is stored to %out.
; The expected code for every RUN line zero-fills four 32-bit scalar registers,
; conditionally overwrites them with the argument, and issues one 128-bit store.
define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, ptr addrspace(1) %out, <2 x i64> %value) {
; GCN-LABEL: bitcast_v2i64_to_v2f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s9, s[4:5], 0x9
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf
; GCN-NEXT: s_mov_b32 s8, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s9, 0
; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_mov_b32 s10, s8
; GCN-NEXT: s_mov_b32 s11, s8
; GCN-NEXT: s_cbranch_scc1 .LBB10_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: s_mov_b32 s4, s2
; GCN-NEXT: s_mov_b32 s5, s3
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: .LBB10_2: ; %end
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v2i64_to_v2f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s11, s[4:5], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; VI-NEXT: s_mov_b32 s8, 0
; VI-NEXT: s_mov_b32 s9, s8
; VI-NEXT: s_mov_b32 s10, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s11, 0
; VI-NEXT: s_mov_b32 s11, s8
; VI-NEXT: s_cbranch_scc1 .LBB10_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_mov_b64 s[10:11], s[6:7]
; VI-NEXT: s_mov_b64 s[8:9], s[4:5]
; VI-NEXT: .LBB10_2: ; %end
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v2i64_to_v2f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s11, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX9-NEXT: s_mov_b32 s8, 0
; GFX9-NEXT: s_mov_b32 s9, s8
; GFX9-NEXT: s_mov_b32 s10, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
; GFX9-NEXT: s_mov_b32 s11, s8
; GFX9-NEXT: s_cbranch_scc1 .LBB10_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX9-NEXT: .LBB10_2: ; %end
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v2i64_to_v2f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s11, s[4:5], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
; GFX11-NEXT: s_mov_b32 s8, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s9, s8
; GFX11-NEXT: s_mov_b32 s10, s8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s11, 0
; GFX11-NEXT: s_mov_b32 s11, s8
; GFX11-NEXT: s_cbranch_scc1 .LBB10_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: s_mov_b32 s4, s2
; GFX11-NEXT: s_mov_b32 s5, s3
; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX11-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX11-NEXT: .LBB10_2: ; %end
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
; GFX11-NEXT: v_mov_b32_e32 v2, s10
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <2 x i64> %value to <2 x double>
br label %end
end:
; Zero arrives directly from %entry (%cond != 0); %cast arrives from %if (%cond == 0).
%phi = phi <2 x double> [zeroinitializer, %entry], [%cast, %if]
store <2 x double> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <2 x double> -> <2 x i64> whose result is only used by a phi in a
; different basic block; the reverse direction of @bitcast_v2i64_to_v2f64.
; When %cond == 0 control goes through %if, where the kernel argument %value is
; bitcast; otherwise %entry branches straight to %end. The phi in %end selects
; the cast value or zeroinitializer accordingly and the result is stored to %out.
; Apart from the .LBB11_2 label, the expected instruction sequences match those
; of @bitcast_v2i64_to_v2f64 for every RUN line.
define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, ptr addrspace(1) %out, <2 x double> %value) {
; GCN-LABEL: bitcast_v2f64_to_v2i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s9, s[4:5], 0x9
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf
; GCN-NEXT: s_mov_b32 s8, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s9, 0
; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_mov_b32 s10, s8
; GCN-NEXT: s_mov_b32 s11, s8
; GCN-NEXT: s_cbranch_scc1 .LBB11_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: s_mov_b32 s4, s2
; GCN-NEXT: s_mov_b32 s5, s3
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: .LBB11_2: ; %end
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v2f64_to_v2i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s11, s[4:5], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; VI-NEXT: s_mov_b32 s8, 0
; VI-NEXT: s_mov_b32 s9, s8
; VI-NEXT: s_mov_b32 s10, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s11, 0
; VI-NEXT: s_mov_b32 s11, s8
; VI-NEXT: s_cbranch_scc1 .LBB11_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_mov_b64 s[10:11], s[6:7]
; VI-NEXT: s_mov_b64 s[8:9], s[4:5]
; VI-NEXT: .LBB11_2: ; %end
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v2f64_to_v2i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s11, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX9-NEXT: s_mov_b32 s8, 0
; GFX9-NEXT: s_mov_b32 s9, s8
; GFX9-NEXT: s_mov_b32 s10, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
; GFX9-NEXT: s_mov_b32 s11, s8
; GFX9-NEXT: s_cbranch_scc1 .LBB11_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX9-NEXT: .LBB11_2: ; %end
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v2f64_to_v2i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s11, s[4:5], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
; GFX11-NEXT: s_mov_b32 s8, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s9, s8
; GFX11-NEXT: s_mov_b32 s10, s8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s11, 0
; GFX11-NEXT: s_mov_b32 s11, s8
; GFX11-NEXT: s_cbranch_scc1 .LBB11_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: s_mov_b32 s4, s2
; GFX11-NEXT: s_mov_b32 s5, s3
; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX11-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX11-NEXT: .LBB11_2: ; %end
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
; GFX11-NEXT: v_mov_b32_e32 v2, s10
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <2 x double> %value to <2 x i64>
br label %end
end:
; Zero arrives directly from %entry (%cond != 0); %cast arrives from %if (%cond == 0).
%phi = phi <2 x i64> [zeroinitializer, %entry], [%cast, %if]
store <2 x i64> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <4 x i16> -> double with arithmetic on both sides of the cast:
; load <4 x i16>, add <4, 4, 4, 4>, bitcast to double, fadd 1.0, store to %out.
; Expected code: the default-target and tonga expectations add 4 to each 16-bit
; half of both dwords with scalar mask/add/or sequences (the high halves as adds
; of 0x40000); the gfx900 and gfx1100 expectations use one v_pk_add_u16 per
; dword. All of them finish with a single v_add_f64 on the register pair.
define amdgpu_kernel void @v4i16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v4i16_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s6, s5, 0xffff0000
; GCN-NEXT: s_add_i32 s5, s5, 4
; GCN-NEXT: s_and_b32 s7, s4, 0xffff0000
; GCN-NEXT: s_add_i32 s4, s4, 4
; GCN-NEXT: s_and_b32 s5, s5, 0xffff
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_or_b32 s5, s6, s5
; GCN-NEXT: s_or_b32 s4, s7, s4
; GCN-NEXT: s_add_i32 s5, s5, 0x40000
; GCN-NEXT: s_add_i32 s4, s4, 0x40000
; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v4i16_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s4, s2, 0xffff0000
; VI-NEXT: s_add_i32 s2, s2, 4
; VI-NEXT: s_and_b32 s5, s3, 0xffff0000
; VI-NEXT: s_add_i32 s3, s3, 4
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s3, s5, s3
; VI-NEXT: s_or_b32 s2, s4, s2
; VI-NEXT: s_add_i32 s3, s3, 0x40000
; VI-NEXT: s_add_i32 s2, s2, 0x40000
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4i16_to_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0]
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v4i16_to_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, s3, 4 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v0, s2, 4 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(1) %in, align 4
%add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
%bc = bitcast <4 x i16> %add.v4i16 to double
%fadd.bitcast = fadd double %bc, 1.0
store double %fadd.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast <4 x half> -> double.
; Loads a <4 x half> from %in, adds 4.0 to every element, reinterprets the
; 64-bit result as a double, adds 1.0 to it and stores the double to %out.
; NOTE(review): the arithmetic on both sides of the bitcast presumably keeps
; the cast from being folded into the load or the store - confirm.
define amdgpu_kernel void @v4f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v4f16_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4
; GCN-NEXT: s_lshr_b32 s4, s4, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5
; GCN-NEXT: s_lshr_b32 s5, s5, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v2, s4
; GCN-NEXT: v_cvt_f32_f16_e32 v3, s5
; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1
; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v4f16_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: v_add_f16_e64 v1, s3, 4.0
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_add_f16_e64 v2, s2, 4.0
; VI-NEXT: v_add_f16_sdwa v3, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v1, v3
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4f16_to_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v4f16_to_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, s3, 4.0 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <4 x half>, ptr addrspace(1) %in, align 4
%add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
%bc = bitcast <4 x half> %add.v4half to double
%fadd.bitcast = fadd double %bc, 1.0
store double %fadd.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast double -> <4 x half>.
; Loads a double from %in, adds 1.0 to it, reinterprets the 64-bit result as
; <4 x half>, adds 2.0 to every element and stores the vector to %out.
; NOTE(review): the arithmetic on both sides of the bitcast presumably keeps
; the cast from being folded into the load or the store - confirm.
define amdgpu_kernel void @f64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: f64_to_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1
; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0
; GCN-NEXT: v_add_f32_e32 v2, 2.0, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_add_f32_e32 v3, 2.0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v1, v1, v2
; GCN-NEXT: v_or_b32_e32 v0, v0, v3
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: f64_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v4, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
; VI-NEXT: v_add_f16_sdwa v5, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v1, 2.0, v1
; VI-NEXT: v_add_f16_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
; VI-NEXT: v_or_b32_e32 v1, v1, v5
; VI-NEXT: v_or_b32_e32 v0, v0, v4
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: f64_to_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
; GFX9-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: f64_to_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load double, ptr addrspace(1) %in, align 4
%fadd64 = fadd double %load, 1.0
%bc = bitcast double %fadd64 to <4 x half>
%add.bitcast = fadd <4 x half> %bc, <half 2.0, half 2.0, half 2.0, half 2.0>
store <4 x half> %add.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast double -> <4 x i16>.
; Loads a double from %in, adds 1.0 to it, reinterprets the 64-bit result as
; <4 x i16>, adds 2 to every element and stores the vector to %out.
; NOTE(review): the arithmetic on both sides of the bitcast presumably keeps
; the cast from being folded into the load or the store - confirm.
define amdgpu_kernel void @f64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: f64_to_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GCN-NEXT: v_add_i32_e32 v1, vcc, 2, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_or_b32_e32 v1, v2, v1
; GCN-NEXT: v_or_b32_e32 v0, v3, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x20000, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: f64_to_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1
; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x20000, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x20000, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: f64_to_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: f64_to_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load double, ptr addrspace(1) %in, align 4
%fadd64 = fadd double %load, 1.0
%bc = bitcast double %fadd64 to <4 x i16>
%add.bitcast = add <4 x i16> %bc, <i16 2, i16 2, i16 2, i16 2>
store <4 x i16> %add.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast <4 x i16> -> i64.
; Loads a <4 x i16> from %in, adds 4 to every element, reinterprets the
; 64-bit result as an i64, adds 1 to it and stores the i64 to %out.
; NOTE(review): the arithmetic on both sides of the bitcast presumably keeps
; the cast from being folded into the load or the store - confirm.
define amdgpu_kernel void @v4i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v4i16_to_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s2, s5, 0xffff0000
; GCN-NEXT: s_add_i32 s5, s5, 4
; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000
; GCN-NEXT: s_add_i32 s4, s4, 4
; GCN-NEXT: s_and_b32 s5, s5, 0xffff
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_or_b32 s2, s2, s5
; GCN-NEXT: s_or_b32 s4, s6, s4
; GCN-NEXT: s_add_i32 s2, s2, 0x40000
; GCN-NEXT: s_add_i32 s4, s4, 0x40000
; GCN-NEXT: s_add_u32 s4, s4, 1
; GCN-NEXT: s_addc_u32 s5, s2, 0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v4i16_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s2, 0xffff0000
; VI-NEXT: s_add_i32 s1, s2, 4
; VI-NEXT: s_and_b32 s2, s3, 0xffff0000
; VI-NEXT: s_add_i32 s3, s3, 4
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_and_b32 s1, s1, 0xffff
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_add_i32 s2, s2, 0x40000
; VI-NEXT: s_add_i32 s0, s0, 0x40000
; VI-NEXT: s_add_u32 s0, s0, 1
; VI-NEXT: s_addc_u32 s1, s2, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4i16_to_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v4i16_to_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, s2, 4 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v1, s3, 4 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(1) %in, align 4
%add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
%bc = bitcast <4 x i16> %add.v4i16 to i64
%add.bitcast = add i64 %bc, 1
store i64 %add.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast <4 x half> -> i64.
; Loads a <4 x half> from %in, adds 4.0 to every element, reinterprets the
; 64-bit result as an i64, adds 1 to it and stores the i64 to %out.
; NOTE(review): the arithmetic on both sides of the bitcast presumably keeps
; the cast from being folded into the load or the store - confirm.
define amdgpu_kernel void @v4f16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v4f16_to_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4
; GCN-NEXT: s_lshr_b32 s4, s4, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5
; GCN-NEXT: s_lshr_b32 s5, s5, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v2, s4
; GCN-NEXT: v_cvt_f32_f16_e32 v3, s5
; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1
; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v4f16_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s3, 16
; VI-NEXT: s_lshr_b32 s1, s2, 16
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_add_f16_e64 v4, s2, 4.0
; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_add_f16_e64 v3, s3, 4.0
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: v_or_b32_e32 v3, v3, v5
; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4f16_to_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v4f16_to_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_f16 v1, s3, 4.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <4 x half>, ptr addrspace(1) %in, align 4
%add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
%bc = bitcast <4 x half> %add.v4half to i64
%add.bitcast = add i64 %bc, 1
store i64 %add.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast i64 -> <4 x i16>.
; Loads an i64 from %in, adds 4 to it, reinterprets the result as <4 x i16>,
; adds the per-element constants <1, 2, 3, 4> and stores the vector to %out.
; The distinct per-element constants make it visible in the checks which
; 16-bit element each addend is applied to.
define amdgpu_kernel void @bitcast_i64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: bitcast_i64_to_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s2, s4, 4
; GCN-NEXT: s_addc_u32 s4, s5, 0
; GCN-NEXT: s_and_b32 s5, s2, 0xffff0000
; GCN-NEXT: s_add_i32 s2, s2, 1
; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000
; GCN-NEXT: s_add_i32 s4, s4, 3
; GCN-NEXT: s_and_b32 s2, s2, 0xffff
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_or_b32 s2, s5, s2
; GCN-NEXT: s_or_b32 s4, s6, s4
; GCN-NEXT: s_add_i32 s5, s2, 0x20000
; GCN-NEXT: s_add_i32 s4, s4, 0x40000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_i64_to_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s2, 4
; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: s_and_b32 s2, s0, 0xffff0000
; VI-NEXT: s_add_i32 s0, s0, 1
; VI-NEXT: s_and_b32 s3, s1, 0xffff0000
; VI-NEXT: s_add_i32 s1, s1, 3
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_and_b32 s1, s1, 0xffff
; VI-NEXT: s_or_b32 s0, s2, s0
; VI-NEXT: s_or_b32 s1, s3, s1
; VI-NEXT: s_add_i32 s0, s0, 0x20000
; VI-NEXT: s_add_i32 s1, s1, 0x40000
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_i64_to_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s2, s2, 4
; GFX9-NEXT: s_addc_u32 s3, s3, 0
; GFX9-NEXT: v_pk_add_u16 v1, s3, v0
; GFX9-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_i64_to_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, 4
; GFX11-NEXT: s_addc_u32 s3, s3, 0
; GFX11-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in, align 8
%add = add i64 %val, 4
%bc = bitcast i64 %add to <4 x i16>
%add.v4i16 = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
store <4 x i16> %add.v4i16, ptr addrspace(1) %out, align 8
ret void
}
; Bitcast i64 -> <4 x half>.
; Loads an i64 from %in, adds 4 to it, reinterprets the result as <4 x half>,
; adds the per-element constants <1.0, 2.0, 4.0, 8.0> and stores the vector
; to %out. The distinct per-element constants make it visible in the checks
; which 16-bit element each addend is applied to.
define amdgpu_kernel void @bitcast_i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: bitcast_i64_to_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s4, 4
; GCN-NEXT: s_addc_u32 s5, s5, 0
; GCN-NEXT: s_lshr_b32 s6, s4, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4
; GCN-NEXT: s_lshr_b32 s4, s5, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v1, s6
; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4
; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_add_f32_e32 v3, 0x41000000, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GCN-NEXT: v_or_b32_e32 v1, v2, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_i64_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4800
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s2, 4
; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: s_lshr_b32 s3, s1, 16
; VI-NEXT: s_lshr_b32 s2, s0, 16
; VI-NEXT: v_mov_b32_e32 v6, s3
; VI-NEXT: v_add_f16_e64 v4, s1, 4.0
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_add_f16_sdwa v5, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, v4, v2
; VI-NEXT: v_add_f16_e64 v2, s0, 1.0
; VI-NEXT: v_or_b32_e32 v2, v2, v5
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_i64_to_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x48004400
; GFX9-NEXT: v_mov_b32_e32 v3, 0x40003c00
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s2, s2, 4
; GFX9-NEXT: s_addc_u32 s3, s3, 0
; GFX9-NEXT: v_pk_add_f16 v1, s3, v0
; GFX9-NEXT: v_pk_add_f16 v0, s2, v3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_i64_to_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, 4
; GFX11-NEXT: s_addc_u32 s3, s3, 0
; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2
; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in, align 8
%add = add i64 %val, 4
%bc = bitcast i64 %add to <4 x half>
%add.v4f16 = fadd <4 x half> %bc, <half 1.0, half 2.0, half 4.0, half 8.0>
store <4 x half> %add.v4f16, ptr addrspace(1) %out, align 8
ret void
}
; Bitcast <4 x i16> -> <2 x float>.
; Loads a <4 x i16> from %in, adds 4 to every element, reinterprets the
; 64-bit result as <2 x float>, adds 1.0 to both elements and stores the
; vector to %out.
; NOTE(review): the arithmetic on both sides of the bitcast presumably keeps
; the cast from being folded into the load or the store - confirm.
define amdgpu_kernel void @v4i16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v4i16_to_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000
; GCN-NEXT: s_add_i32 s4, s4, 4
; GCN-NEXT: s_and_b32 s7, s5, 0xffff0000
; GCN-NEXT: s_add_i32 s5, s5, 4
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_and_b32 s5, s5, 0xffff
; GCN-NEXT: s_or_b32 s4, s6, s4
; GCN-NEXT: s_or_b32 s5, s7, s5
; GCN-NEXT: s_add_i32 s4, s4, 0x40000
; GCN-NEXT: s_add_i32 s5, s5, 0x40000
; GCN-NEXT: v_add_f32_e64 v1, s5, 1.0
; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v4i16_to_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s3, 0xffff0000
; VI-NEXT: s_add_i32 s1, s3, 4
; VI-NEXT: s_and_b32 s3, s2, 0xffff0000
; VI-NEXT: s_add_i32 s2, s2, 4
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_and_b32 s1, s1, 0xffff
; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_add_i32 s2, s2, 0x40000
; VI-NEXT: s_add_i32 s0, s0, 0x40000
; VI-NEXT: v_add_f32_e64 v3, s0, 1.0
; VI-NEXT: v_add_f32_e64 v2, s2, 1.0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4i16_to_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0]
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v4i16_to_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, s3, 4 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v2, s2, 4 op_sel_hi:[1,0]
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v0 :: v_dual_add_f32 v0, 1.0, v2
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(1) %in, align 4
%add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
%bc = bitcast <4 x i16> %add.v4i16 to <2 x float>
%fadd.bitcast = fadd <2 x float> %bc, <float 1.0, float 1.0>
store <2 x float> %fadd.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast <4 x half> -> <2 x float>.
; Loads a <4 x half> from %in, adds 4.0 to every element, reinterprets the
; 64-bit result as <2 x float>, adds 1.0 to both elements and stores the
; vector to %out.
; NOTE(review): the arithmetic on both sides of the bitcast presumably keeps
; the cast from being folded into the load or the store - confirm.
define amdgpu_kernel void @v4f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v4f16_to_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v0, s5
; GCN-NEXT: s_lshr_b32 s5, s5, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4
; GCN-NEXT: s_lshr_b32 s4, s4, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5
; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4
; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1
; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_or_b32_e32 v3, v1, v3
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v0
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v3
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v4f16_to_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: s_lshr_b32 s1, s3, 16
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_add_f16_e64 v3, s2, 4.0
; VI-NEXT: v_add_f16_e64 v4, s3, 4.0
; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v5, v3, v5
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: v_add_f32_e32 v3, 1.0, v2
; VI-NEXT: v_add_f32_e32 v2, 1.0, v5
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4f16_to_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v4f16_to_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, s3, 4.0 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_f16 v2, s2, 4.0 op_sel_hi:[1,0]
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v0 :: v_dual_add_f32 v0, 1.0, v2
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <4 x half>, ptr addrspace(1) %in, align 4
%add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
%bc = bitcast <4 x half> %add.v4half to <2 x float>
%fadd.bitcast = fadd <2 x float> %bc, <float 1.0, float 1.0>
store <2 x float> %fadd.bitcast, ptr addrspace(1) %out
ret void
}
; Bitcast <2 x float> -> <4 x i16>.
; Loads a <2 x float> from %in, adds <2.0, 4.0> to it, reinterprets the
; 64-bit result as <4 x i16>, adds the per-element constants <1, 2, 3, 4>
; and stores the vector to %out. The distinct constants make it visible in
; the checks which element each addend is applied to.
define amdgpu_kernel void @v2f32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v2f32_to_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e64 v0, s4, 2.0
; GCN-NEXT: v_add_f32_e64 v1, s5, 4.0
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_or_b32_e32 v1, v2, v1
; GCN-NEXT: v_or_b32_e32 v0, v3, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x40000, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v2f32_to_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f32_e64 v2, s3, 4.0
; VI-NEXT: v_add_f32_e64 v3, s2, 2.0
; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3
; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x40000, v2
; VI-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x20000, v2
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v2f32_to_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s2, 0x40003
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v0, s4, 2.0
; GFX9-NEXT: v_add_f32_e64 v1, s5, 4.0
; GFX9-NEXT: v_pk_add_u16 v1, v1, s2
; GFX9-NEXT: v_pk_sub_u16 v0, v0, -2 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v2f32_to_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v0, s3, 4.0
; GFX11-NEXT: v_add_f32_e64 v2, s2, 2.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, v0
; GFX11-NEXT: v_pk_sub_u16 v0, v2, -2 op_sel:[0,1] op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <2 x float>, ptr addrspace(1) %in, align 4
%add.v2f32 = fadd <2 x float> %load, <float 2.0, float 4.0>
%bc = bitcast <2 x float> %add.v2f32 to <4 x i16>
%add.bitcast = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
store <4 x i16> %add.bitcast, ptr addrspace(1) %out
ret void
}
; v2f32_to_v4f16: bitcast of <2 x float> to <4 x half>.
; The kernel loads a <2 x float> from %in, adds <2.0, 4.0> to it, reinterprets
; the resulting 64 bits as <4 x half>, adds <1.0, 2.0, 4.0, 8.0> in half
; precision and stores the <4 x half> result to %out.
; Presumably the arithmetic on both sides of the bitcast is there so that the
; bitcast cannot simply be folded into the load or the store (to be confirmed
; against the intent of the test).
define amdgpu_kernel void @v2f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v2f32_to_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e64 v0, s5, 4.0
; GCN-NEXT: v_add_f32_e64 v1, s4, 2.0
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_add_f32_e32 v2, 0x41000000, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_add_f32_e32 v3, 2.0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_or_b32_e32 v1, v0, v1
; GCN-NEXT: v_or_b32_e32 v0, v4, v2
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v2f32_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4800
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f32_e64 v4, s2, 2.0
; VI-NEXT: v_add_f32_e64 v5, s3, 4.0
; VI-NEXT: v_add_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v5, 4.0, v5
; VI-NEXT: v_add_f16_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v4, 1.0, v4
; VI-NEXT: v_or_b32_e32 v3, v5, v2
; VI-NEXT: v_or_b32_e32 v2, v4, v6
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v2f32_to_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s2, 0x48004400
; GFX9-NEXT: s_mov_b32 s3, 0x40003c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v0, s4, 2.0
; GFX9-NEXT: v_add_f32_e64 v1, s5, 4.0
; GFX9-NEXT: v_pk_add_f16 v1, v1, s2
; GFX9-NEXT: v_pk_add_f16 v0, v0, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v2f32_to_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v0, s3, 4.0
; GFX11-NEXT: v_add_f32_e64 v2, s2, 2.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, v0
; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v2
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
; The half constants 1.0, 2.0, 4.0 and 8.0 encode as 0x3c00, 0x4000, 0x4400
; and 0x4800; packed two per dword they give the literals 0x40003c00 and
; 0x48004400 seen in the gfx900 and gfx1100 checks above.
%load = load <2 x float>, ptr addrspace(1) %in, align 4
%add.v2f32 = fadd <2 x float> %load, <float 2.0, float 4.0>
%bc = bitcast <2 x float> %add.v2f32 to <4 x half>
%add.bitcast = fadd <4 x half> %bc, <half 1.0, half 2.0, half 4.0, half 8.0>
store <4 x half> %add.bitcast, ptr addrspace(1) %out
ret void
}
; v4i16_to_v2i32: bitcast of <4 x i16> to <2 x i32>.
; The kernel loads a <4 x i16> from %in, adds 4 to every 16-bit lane,
; reinterprets the 64-bit result as <2 x i32>, adds 1 to each 32-bit lane and
; stores the <2 x i32> result to %out.
define amdgpu_kernel void @v4i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v4i16_to_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s2, s4, 0xffff0000
; GCN-NEXT: s_add_i32 s4, s4, 4
; GCN-NEXT: s_and_b32 s6, s5, 0xffff0000
; GCN-NEXT: s_add_i32 s5, s5, 4
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_and_b32 s5, s5, 0xffff
; GCN-NEXT: s_or_b32 s2, s2, s4
; GCN-NEXT: s_or_b32 s4, s6, s5
; GCN-NEXT: s_add_i32 s4, s4, 0x40001
; GCN-NEXT: s_add_i32 s5, s2, 0x40001
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v4i16_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s3, 0xffff0000
; VI-NEXT: s_add_i32 s1, s3, 4
; VI-NEXT: s_and_b32 s3, s2, 0xffff0000
; VI-NEXT: s_add_i32 s2, s2, 4
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_and_b32 s1, s1, 0xffff
; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_add_i32 s0, s0, 0x40001
; VI-NEXT: s_add_i32 s2, s2, 0x40001
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4i16_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0]
; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
; GFX9-NEXT: v_add_u32_e32 v0, 1, v0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v4i16_to_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, s3, 4 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v2, s2, 4 op_sel_hi:[1,0]
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v0
; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v2
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
; Test body: 16-bit lane add, then the bitcast under test, then 32-bit lane add.
%load = load <4 x i16>, ptr addrspace(1) %in, align 4
%add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
%bc = bitcast <4 x i16> %add.v4i16 to <2 x i32>
%add.bitcast = add <2 x i32> %bc, <i32 1, i32 1>
store <2 x i32> %add.bitcast, ptr addrspace(1) %out
ret void
}
; v4f16_to_v2i32: bitcast of <4 x half> to <2 x i32>.
; The kernel loads a <4 x half> from %in, adds 4.0 to every half lane,
; reinterprets the 64-bit result as <2 x i32>, adds 1 to each 32-bit lane and
; stores the <2 x i32> result to %out.
define amdgpu_kernel void @v4f16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v4f16_to_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v0, s5
; GCN-NEXT: s_lshr_b32 s5, s5, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4
; GCN-NEXT: s_lshr_b32 s4, s4, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5
; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4
; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1
; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_or_b32_e32 v3, v1, v3
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v3
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v4f16_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: s_lshr_b32 s1, s3, 16
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_add_f16_e64 v3, s2, 4.0
; VI-NEXT: v_add_f16_e64 v4, s3, 4.0
; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v5, v3, v5
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2
; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v5
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v4f16_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
; GFX9-NEXT: v_add_u32_e32 v0, 1, v0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v4f16_to_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, s3, 4.0 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_f16 v2, s2, 4.0 op_sel_hi:[1,0]
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v0
; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v2
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
; Test body: half-precision lane add, then the bitcast under test, then an
; integer add on the 32-bit lanes. Half 4.0 encodes as 0x4400, the literal
; materialized in the tonga checks above.
%load = load <4 x half>, ptr addrspace(1) %in, align 4
%add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
%bc = bitcast <4 x half> %add.v4half to <2 x i32>
%add.bitcast = add <2 x i32> %bc, <i32 1, i32 1>
store <2 x i32> %add.bitcast, ptr addrspace(1) %out
ret void
}
; v2i32_to_v4i16: bitcast of <2 x i32> to <4 x i16>.
; The kernel loads a <2 x i32> from %in, adds <2, 4> to it, reinterprets the
; 64-bit result as <4 x i16>, adds <1, 2, 3, 4> to the 16-bit lanes and stores
; the <4 x i16> result to %out.
define amdgpu_kernel void @v2i32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v2i32_to_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_i32 s2, s4, 2
; GCN-NEXT: s_add_i32 s6, s5, 4
; GCN-NEXT: s_add_i32 s5, s5, 7
; GCN-NEXT: s_add_i32 s4, s4, 3
; GCN-NEXT: s_and_b32 s5, s5, 0xffff
; GCN-NEXT: s_and_b32 s6, s6, 0xffff0000
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_and_b32 s2, s2, 0xffff0000
; GCN-NEXT: s_or_b32 s5, s6, s5
; GCN-NEXT: s_or_b32 s2, s2, s4
; GCN-NEXT: s_add_i32 s5, s5, 0x40000
; GCN-NEXT: s_add_i32 s4, s2, 0x20000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v2i32_to_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_i32 s0, s3, 4
; VI-NEXT: s_add_i32 s1, s2, 2
; VI-NEXT: s_add_i32 s2, s2, 3
; VI-NEXT: s_add_i32 s3, s3, 7
; VI-NEXT: s_and_b32 s1, s1, 0xffff0000
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s0, s0, s3
; VI-NEXT: s_or_b32 s1, s1, s2
; VI-NEXT: s_add_i32 s0, s0, 0x40000
; VI-NEXT: s_add_i32 s1, s1, 0x20000
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v2i32_to_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s4, 2
; GFX9-NEXT: s_add_i32 s3, s5, 4
; GFX9-NEXT: v_pk_add_u16 v1, s3, v0
; GFX9-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v2i32_to_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s3, s3, 4
; GFX11-NEXT: s_add_i32 s2, s2, 2
; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, s3
; GFX11-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
; Test body: 32-bit lane add, then the bitcast under test, then 16-bit lane
; add. The i16 addends 3 and 4 for the upper two lanes pack into the dword
; literal 0x40003 seen in the gfx900 and gfx1100 checks above.
%load = load <2 x i32>, ptr addrspace(1) %in, align 4
%add.v2i32 = add <2 x i32> %load, <i32 2, i32 4>
%bc = bitcast <2 x i32> %add.v2i32 to <4 x i16>
%add.bitcast = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
store <4 x i16> %add.bitcast, ptr addrspace(1) %out
ret void
}
; v2i32_to_v4f16: bitcast of <2 x i32> to <4 x half>.
; The kernel loads a <2 x i32> from %in, adds <2, 4> to it, reinterprets the
; 64-bit result as <4 x half>, adds <1.0, 2.0, 4.0, 8.0> in half precision and
; stores the <4 x half> result to %out.
define amdgpu_kernel void @v2i32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; GCN-LABEL: v2i32_to_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_i32 s5, s5, 4
; GCN-NEXT: s_add_i32 s4, s4, 2
; GCN-NEXT: s_lshr_b32 s6, s5, 16
; GCN-NEXT: s_lshr_b32 s7, s4, 16
; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4
; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5
; GCN-NEXT: v_cvt_f32_f16_e32 v2, s7
; GCN-NEXT: v_cvt_f32_f16_e32 v3, s6
; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_add_f32_e32 v3, 0x41000000, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_add_f32_e32 v2, 2.0, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: v2i32_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4800
; VI-NEXT: v_mov_b32_e32 v4, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_i32 s1, s3, 4
; VI-NEXT: s_add_i32 s0, s2, 2
; VI-NEXT: s_lshr_b32 s2, s1, 16
; VI-NEXT: v_add_f16_e64 v3, s1, 4.0
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_add_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, v3, v2
; VI-NEXT: v_add_f16_sdwa v2, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_add_f16_e64 v4, s0, 1.0
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v2i32_to_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x48004400
; GFX9-NEXT: v_mov_b32_e32 v3, 0x40003c00
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s4, 2
; GFX9-NEXT: s_add_i32 s3, s5, 4
; GFX9-NEXT: v_pk_add_f16 v1, s3, v0
; GFX9-NEXT: v_pk_add_f16 v0, s2, v3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v2i32_to_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s3, s3, 4
; GFX11-NEXT: s_add_i32 s2, s2, 2
; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, s3
; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
; Test body: 32-bit integer lane add, then the bitcast under test, then a
; half-precision lane add. The half constants 1.0, 2.0, 4.0 and 8.0 encode as
; 0x3c00, 0x4000, 0x4400 and 0x4800, giving the packed literals 0x40003c00 and
; 0x48004400 seen in the gfx900 and gfx1100 checks above.
%load = load <2 x i32>, ptr addrspace(1) %in, align 4
%add.v2i32 = add <2 x i32> %load, <i32 2, i32 4>
%bc = bitcast <2 x i32> %add.v2i32 to <4 x half>
%add.bitcast = fadd <4 x half> %bc, <half 1.0, half 2.0, half 4.0, half 8.0>
store <4 x half> %add.bitcast, ptr addrspace(1) %out
ret void
}
declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg)
define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) {
; GCN-LABEL: bitcast_v4f32_to_v2i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v5, v1
; GCN-NEXT: v_mov_b32_e32 v4, v0
; GCN-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_or_b32_e32 v1, s9, v5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB28_2
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v4
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v5
; GCN-NEXT: s_mov_b32 s4, 0x4f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v5, vcc
; GCN-NEXT: v_mov_b32_e32 v8, s9
; GCN-NEXT: v_fma_f32 v0, v1, s4, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_fma_f32 v0, v1, s5, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_mul_lo_u32 v9, v6, v1
; GCN-NEXT: v_mul_lo_u32 v10, v7, v0
; GCN-NEXT: v_mul_hi_u32 v11, v6, v0
; GCN-NEXT: v_mul_lo_u32 v12, v6, v0
; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v9
; GCN-NEXT: v_mul_hi_u32 v11, v0, v12
; GCN-NEXT: v_mul_hi_u32 v13, v1, v12
; GCN-NEXT: v_mul_lo_u32 v12, v1, v12
; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GCN-NEXT: v_mul_hi_u32 v10, v0, v9
; GCN-NEXT: v_mul_lo_u32 v14, v0, v9
; GCN-NEXT: v_mul_hi_u32 v15, v1, v9
; GCN-NEXT: v_mul_lo_u32 v9, v1, v9
; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v14
; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GCN-NEXT: v_addc_u32_e32 v10, vcc, v10, v13, vcc
; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc
; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc
; GCN-NEXT: v_mul_hi_u32 v9, v6, v0
; GCN-NEXT: v_mul_lo_u32 v7, v7, v0
; GCN-NEXT: v_mul_lo_u32 v10, v6, v0
; GCN-NEXT: v_mul_lo_u32 v6, v6, v1
; GCN-NEXT: v_mul_hi_u32 v11, v1, v10
; GCN-NEXT: v_mul_lo_u32 v12, v1, v10
; GCN-NEXT: v_mul_hi_u32 v10, v0, v10
; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GCN-NEXT: v_mul_hi_u32 v7, v1, v6
; GCN-NEXT: v_mul_hi_u32 v9, v0, v6
; GCN-NEXT: v_mul_lo_u32 v13, v0, v6
; GCN-NEXT: v_mul_lo_u32 v6, v1, v6
; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v13
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
; GCN-NEXT: v_mul_hi_u32 v6, s8, v0
; GCN-NEXT: v_mul_hi_u32 v7, s9, v0
; GCN-NEXT: v_mul_lo_u32 v0, s9, v0
; GCN-NEXT: v_mul_hi_u32 v9, s8, v1
; GCN-NEXT: v_mul_lo_u32 v10, s8, v1
; GCN-NEXT: v_mul_hi_u32 v11, s9, v1
; GCN-NEXT: v_mul_lo_u32 v1, s9, v1
; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, v9, v7, vcc
; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
; GCN-NEXT: v_mul_hi_u32 v6, v4, v0
; GCN-NEXT: v_mul_lo_u32 v7, v5, v0
; GCN-NEXT: v_mul_lo_u32 v9, v4, v0
; GCN-NEXT: v_mul_lo_u32 v10, v4, v1
; GCN-NEXT: v_add_i32_e32 v11, vcc, 2, v0
; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc
; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v0
; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc
; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10
; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GCN-NEXT: v_sub_i32_e32 v7, vcc, s9, v6
; GCN-NEXT: v_sub_i32_e32 v9, vcc, s8, v9
; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v7, v5, vcc
; GCN-NEXT: v_subb_u32_e32 v6, vcc, v8, v6, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; GCN-NEXT: v_sub_i32_e32 v9, vcc, v9, v4
; GCN-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4
; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5
; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GCN-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5
; GCN-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GCN-NEXT: v_cndmask_b32_e32 v4, v14, v12, vcc
; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v4, v13, v11, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN-NEXT: .LBB28_2: ; %Flow1
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
; GCN-NEXT: s_cbranch_execz .LBB28_4
; GCN-NEXT: ; %bb.3:
; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v4
; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_mul_lo_u32 v0, v0, v1
; GCN-NEXT: v_mul_hi_u32 v0, v1, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
; GCN-NEXT: v_mul_lo_u32 v1, v0, v4
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0
; GCN-NEXT: v_sub_i32_e32 v1, vcc, s8, v1
; GCN-NEXT: v_sub_i32_e32 v6, vcc, v1, v4
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: .LBB28_4:
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_or_b32_e32 v5, s11, v3
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB28_6
; GCN-NEXT: ; %bb.5:
; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3
; GCN-NEXT: s_mov_b32 s4, 0x4f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
; GCN-NEXT: v_mov_b32_e32 v8, s11
; GCN-NEXT: v_fma_f32 v4, v5, s4, v4
; GCN-NEXT: v_rcp_f32_e32 v4, v4
; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; GCN-NEXT: v_trunc_f32_e32 v5, v5
; GCN-NEXT: v_fma_f32 v4, v5, s5, v4
; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
; GCN-NEXT: v_mul_lo_u32 v9, v6, v5
; GCN-NEXT: v_mul_lo_u32 v10, v7, v4
; GCN-NEXT: v_mul_hi_u32 v11, v6, v4
; GCN-NEXT: v_mul_lo_u32 v12, v6, v4
; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v9
; GCN-NEXT: v_mul_hi_u32 v11, v4, v12
; GCN-NEXT: v_mul_hi_u32 v13, v5, v12
; GCN-NEXT: v_mul_lo_u32 v12, v5, v12
; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GCN-NEXT: v_mul_hi_u32 v10, v4, v9
; GCN-NEXT: v_mul_lo_u32 v14, v4, v9
; GCN-NEXT: v_mul_hi_u32 v15, v5, v9
; GCN-NEXT: v_mul_lo_u32 v9, v5, v9
; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v14
; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GCN-NEXT: v_addc_u32_e32 v10, vcc, v10, v13, vcc
; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc
; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc
; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v10, vcc
; GCN-NEXT: v_mul_hi_u32 v9, v6, v4
; GCN-NEXT: v_mul_lo_u32 v7, v7, v4
; GCN-NEXT: v_mul_lo_u32 v10, v6, v4
; GCN-NEXT: v_mul_lo_u32 v6, v6, v5
; GCN-NEXT: v_mul_hi_u32 v11, v5, v10
; GCN-NEXT: v_mul_lo_u32 v12, v5, v10
; GCN-NEXT: v_mul_hi_u32 v10, v4, v10
; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GCN-NEXT: v_mul_hi_u32 v7, v5, v6
; GCN-NEXT: v_mul_hi_u32 v9, v4, v6
; GCN-NEXT: v_mul_lo_u32 v13, v4, v6
; GCN-NEXT: v_mul_lo_u32 v6, v5, v6
; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v13
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
; GCN-NEXT: v_mul_hi_u32 v6, s10, v4
; GCN-NEXT: v_mul_hi_u32 v7, s11, v4
; GCN-NEXT: v_mul_lo_u32 v4, s11, v4
; GCN-NEXT: v_mul_hi_u32 v9, s10, v5
; GCN-NEXT: v_mul_lo_u32 v10, s10, v5
; GCN-NEXT: v_mul_hi_u32 v11, s11, v5
; GCN-NEXT: v_mul_lo_u32 v5, s11, v5
; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc
; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc
; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
; GCN-NEXT: v_mul_lo_u32 v7, v3, v4
; GCN-NEXT: v_mul_lo_u32 v9, v2, v4
; GCN-NEXT: v_mul_lo_u32 v10, v2, v5
; GCN-NEXT: v_add_i32_e32 v11, vcc, 2, v4
; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc
; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v4
; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v5, vcc
; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10
; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GCN-NEXT: v_sub_i32_e32 v7, vcc, s11, v6
; GCN-NEXT: v_sub_i32_e32 v9, vcc, s10, v9
; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v7, v3, vcc
; GCN-NEXT: v_subb_u32_e32 v6, vcc, v8, v6, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v2
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; GCN-NEXT: v_sub_i32_e32 v9, vcc, v9, v2
; GCN-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v2
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
; GCN-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GCN-NEXT: v_cndmask_b32_e32 v2, v14, v12, vcc
; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v2, v13, v11, vcc
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[4:5]
; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN-NEXT: .LBB28_6: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
; GCN-NEXT: s_cbranch_execz .LBB28_8
; GCN-NEXT: ; %bb.7:
; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2
; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
; GCN-NEXT: v_mul_lo_u32 v3, v3, v4
; GCN-NEXT: v_mul_hi_u32 v3, v4, v3
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_mul_hi_u32 v3, s10, v3
; GCN-NEXT: v_mul_lo_u32 v4, v3, v2
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4
; GCN-NEXT: v_sub_i32_e32 v6, vcc, v4, v2
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; GCN-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: .LBB28_8:
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v4
; GCN-NEXT: v_mov_b32_e32 v3, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_v4f32_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0
; VI-NEXT: v_mov_b32_e32 v5, v1
; VI-NEXT: v_mov_b32_e32 v4, v0
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_or_b32_e32 v1, s9, v5
; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB28_2
; VI-NEXT: ; %bb.1:
; VI-NEXT: v_cvt_f32_u32_e32 v0, v4
; VI-NEXT: v_cvt_f32_u32_e32 v1, v5
; VI-NEXT: v_sub_u32_e32 v10, vcc, 0, v4
; VI-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
; VI-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; VI-NEXT: v_rcp_f32_e32 v0, v0
; VI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_trunc_f32_e32 v1, v1
; VI-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; VI-NEXT: v_cvt_u32_f32_e32 v8, v1
; VI-NEXT: v_cvt_u32_f32_e32 v9, v0
; VI-NEXT: v_mul_lo_u32 v6, v10, v8
; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0
; VI-NEXT: v_mul_lo_u32 v7, v11, v9
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v6
; VI-NEXT: v_add_u32_e32 v13, vcc, v1, v7
; VI-NEXT: v_mul_hi_u32 v12, v9, v0
; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v13, 0
; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v6
; VI-NEXT: v_addc_u32_e32 v14, vcc, 0, v7, vcc
; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v13, 0
; VI-NEXT: v_add_u32_e32 v0, vcc, v12, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, v14, v1, vcc
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v12, vcc, v9, v0
; VI-NEXT: v_addc_u32_e32 v13, vcc, v8, v1, vcc
; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v12, 0
; VI-NEXT: v_mul_lo_u32 v8, v10, v13
; VI-NEXT: v_mul_lo_u32 v9, v11, v12
; VI-NEXT: v_mul_hi_u32 v10, v12, v0
; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v0, 0
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v8
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v9
; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v1, 0
; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v1, 0
; VI-NEXT: v_add_u32_e32 v8, vcc, v10, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, v8, v6
; VI-NEXT: v_addc_u32_e32 v6, vcc, v9, v7, vcc
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, v6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, v12, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, v13, v1, vcc
; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v7, 0
; VI-NEXT: v_mul_hi_u32 v8, s8, v6
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v6, 0
; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v7, 0
; VI-NEXT: v_add_u32_e32 v0, vcc, v8, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, v9, v1, vcc
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: v_mul_lo_u32 v8, v4, v7
; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0
; VI-NEXT: v_mul_lo_u32 v9, v5, v6
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v8
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v9
; VI-NEXT: v_sub_u32_e32 v8, vcc, s9, v1
; VI-NEXT: v_sub_u32_e32 v0, vcc, s8, v0
; VI-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v5, vcc
; VI-NEXT: v_sub_u32_e64 v9, s[4:5], v0, v4
; VI-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5
; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4
; VI-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5
; VI-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5]
; VI-NEXT: v_add_u32_e64 v9, s[4:5], 2, v6
; VI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v7, s[4:5]
; VI-NEXT: v_add_u32_e64 v11, s[4:5], 1, v6
; VI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, v7, s[4:5]
; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
; VI-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5]
; VI-NEXT: v_mov_b32_e32 v10, s9
; VI-NEXT: v_subb_u32_e32 v1, vcc, v10, v1, vcc
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; VI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, v11, v9, s[4:5]
; VI-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; VI-NEXT: ; implicit-def: $vgpr4_vgpr5
; VI-NEXT: .LBB28_2: ; %Flow1
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
; VI-NEXT: s_cbranch_execz .LBB28_4
; VI-NEXT: ; %bb.3:
; VI-NEXT: v_cvt_f32_u32_e32 v0, v4
; VI-NEXT: v_sub_u32_e32 v1, vcc, 0, v4
; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0
; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
; VI-NEXT: v_mul_lo_u32 v1, v1, v0
; VI-NEXT: v_mul_hi_u32 v1, v0, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; VI-NEXT: v_mul_hi_u32 v0, s8, v0
; VI-NEXT: v_mul_lo_u32 v1, v0, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v0
; VI-NEXT: v_sub_u32_e32 v1, vcc, s8, v1
; VI-NEXT: v_sub_u32_e32 v6, vcc, v1, v4
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v0
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: .LBB28_4:
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_or_b32_e32 v5, s11, v3
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; VI-NEXT: ; implicit-def: $vgpr4_vgpr5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB28_6
; VI-NEXT: ; %bb.5:
; VI-NEXT: v_cvt_f32_u32_e32 v4, v2
; VI-NEXT: v_cvt_f32_u32_e32 v5, v3
; VI-NEXT: v_sub_u32_e32 v10, vcc, 0, v2
; VI-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
; VI-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4
; VI-NEXT: v_rcp_f32_e32 v4, v4
; VI-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; VI-NEXT: v_trunc_f32_e32 v5, v5
; VI-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4
; VI-NEXT: v_cvt_u32_f32_e32 v8, v5
; VI-NEXT: v_cvt_u32_f32_e32 v9, v4
; VI-NEXT: v_mul_lo_u32 v6, v10, v8
; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
; VI-NEXT: v_mul_lo_u32 v7, v11, v9
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v6
; VI-NEXT: v_add_u32_e32 v7, vcc, v5, v7
; VI-NEXT: v_mul_hi_u32 v12, v9, v4
; VI-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v5
; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v6, vcc
; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
; VI-NEXT: v_add_u32_e32 v4, vcc, v12, v4
; VI-NEXT: v_addc_u32_e32 v4, vcc, v13, v5, vcc
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_add_u32_e32 v12, vcc, v9, v4
; VI-NEXT: v_addc_u32_e32 v13, vcc, v8, v5, vcc
; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
; VI-NEXT: v_mul_lo_u32 v8, v10, v13
; VI-NEXT: v_mul_lo_u32 v9, v11, v12
; VI-NEXT: v_mul_hi_u32 v10, v12, v4
; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v4, 0
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v9
; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, 0
; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v5, 0
; VI-NEXT: v_add_u32_e32 v8, vcc, v10, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, v8, v6
; VI-NEXT: v_addc_u32_e32 v6, vcc, v9, v7, vcc
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, v12, v4
; VI-NEXT: v_addc_u32_e32 v7, vcc, v13, v5, vcc
; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0
; VI-NEXT: v_mul_hi_u32 v8, s10, v6
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v4
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s11, v6, 0
; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s11, v7, 0
; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4
; VI-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
; VI-NEXT: v_mul_lo_u32 v8, v2, v7
; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
; VI-NEXT: v_mul_lo_u32 v9, v3, v6
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v9
; VI-NEXT: v_sub_u32_e32 v8, vcc, s11, v5
; VI-NEXT: v_sub_u32_e32 v4, vcc, s10, v4
; VI-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v3, vcc
; VI-NEXT: v_sub_u32_e64 v9, s[4:5], v4, v2
; VI-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2
; VI-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
; VI-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5]
; VI-NEXT: v_add_u32_e64 v9, s[4:5], 2, v6
; VI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v7, s[4:5]
; VI-NEXT: v_add_u32_e64 v11, s[4:5], 1, v6
; VI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, v7, s[4:5]
; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
; VI-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5]
; VI-NEXT: v_mov_b32_e32 v10, s11
; VI-NEXT: v_subb_u32_e32 v5, vcc, v10, v5, vcc
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_cndmask_b32_e64 v2, v11, v9, s[4:5]
; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
; VI-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc
; VI-NEXT: ; implicit-def: $vgpr2_vgpr3
; VI-NEXT: .LBB28_6: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
; VI-NEXT: s_cbranch_execz .LBB28_8
; VI-NEXT: ; %bb.7:
; VI-NEXT: v_cvt_f32_u32_e32 v3, v2
; VI-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
; VI-NEXT: v_rcp_iflag_f32_e32 v3, v3
; VI-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; VI-NEXT: v_cvt_u32_f32_e32 v3, v3
; VI-NEXT: v_mul_lo_u32 v4, v4, v3
; VI-NEXT: v_mul_hi_u32 v4, v3, v4
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4
; VI-NEXT: v_mul_hi_u32 v3, s10, v3
; VI-NEXT: v_mul_lo_u32 v4, v3, v2
; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3
; VI-NEXT: v_sub_u32_e32 v4, vcc, s10, v4
; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v2
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; VI-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: .LBB28_8:
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_mov_b32_e32 v2, v4
; VI-NEXT: v_mov_b32_e32 v3, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v4f32_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v5, v1
; GFX9-NEXT: v_mov_b32_e32 v4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v1, s9, v5
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB28_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v4
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v5
; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v4
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v5, vcc
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v0
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8
; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0
; GFX9-NEXT: v_add3_u32 v12, v1, v6, v7
; GFX9-NEXT: v_mul_hi_u32 v1, v9, v0
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v12, 0
; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v1, v6
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v7, vcc
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v12, 0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v13, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v14, v1, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v0
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13
; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v12, 0
; GFX9-NEXT: v_add3_u32 v1, v1, v6, v7
; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v1, 0
; GFX9-NEXT: v_mul_hi_u32 v10, v12, v0
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v1, 0
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v0, 0
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v9, v1, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v0
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v7, 0
; GFX9-NEXT: v_mul_hi_u32 v8, s8, v6
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v0
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v6, 0
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v9, v1, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v8, v5, v6
; GFX9-NEXT: v_mul_lo_u32 v9, v4, v7
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0
; GFX9-NEXT: v_add3_u32 v1, v1, v9, v8
; GFX9-NEXT: v_sub_u32_e32 v8, s9, v1
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0
; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v5, vcc
; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v0, v4
; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5]
; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5
; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5]
; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 2, v6
; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, v7, s[4:5]
; GFX9-NEXT: v_add_co_u32_e64 v11, s[4:5], 1, v6
; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, v7, s[4:5]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v10, s9
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v10, v1, vcc
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v11, v9, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX9-NEXT: .LBB28_2: ; %Flow1
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
; GFX9-NEXT: s_cbranch_execz .LBB28_4
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v4
; GFX9-NEXT: v_sub_u32_e32 v1, 0, v4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX9-NEXT: v_mul_lo_u32 v1, v0, v4
; GFX9-NEXT: v_add_u32_e32 v5, 1, v0
; GFX9-NEXT: v_sub_u32_e32 v1, s8, v1
; GFX9-NEXT: v_sub_u32_e32 v6, v1, v4
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX9-NEXT: v_add_u32_e32 v5, 1, v0
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: .LBB28_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_or_b32_e32 v5, s11, v3
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB28_6
; GFX9-NEXT: ; %bb.5:
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3
; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
; GFX9-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4
; GFX9-NEXT: v_rcp_f32_e32 v4, v4
; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
; GFX9-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4
; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v5
; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v4
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8
; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7
; GFX9-NEXT: v_mul_hi_u32 v12, v9, v4
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13
; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4
; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0
; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s11, v6, 0
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s11, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6
; GFX9-NEXT: v_mul_lo_u32 v9, v2, v7
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8
; GFX9-NEXT: v_sub_u32_e32 v8, s11, v5
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s10, v4
; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v3, vcc
; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v4, v2
; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5]
; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5]
; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 2, v6
; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, v7, s[4:5]
; GFX9-NEXT: v_add_co_u32_e64 v11, s[4:5], 1, v6
; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, v7, s[4:5]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v10, s11
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v10, v5, vcc
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, v11, v9, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: .LBB28_6: ; %Flow
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
; GFX9-NEXT: s_cbranch_execz .LBB28_8
; GFX9-NEXT: ; %bb.7:
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2
; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3
; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4
; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
; GFX9-NEXT: v_mul_hi_u32 v3, s10, v3
; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2
; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
; GFX9-NEXT: v_sub_u32_e32 v4, s10, v4
; GFX9-NEXT: v_sub_u32_e32 v6, v4, v2
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: .LBB28_8:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v2, v4
; GFX9-NEXT: v_mov_b32_e32 v3, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: bitcast_v4f32_to_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_buffer_load_b128 s[4:7], s[0:3], 0x0
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v1, s5, v5
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0
; GFX11-NEXT: s_cbranch_execz .LBB28_2
; GFX11-NEXT: ; %bb.1:
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v4
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v5
; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, 0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_co_ci_u32_e64 v11, null, 0, v5, vcc_lo
; GFX11-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX11-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_trunc_f32_e32 v1, v1
; GFX11-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v0
; GFX11-NEXT: v_mul_lo_u32 v6, v10, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_lo_u32 v7, v11, v13
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v10, v13, 0
; GFX11-NEXT: v_add3_u32 v14, v1, v6, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_hi_u32 v15, v13, v0
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v12, v0, 0
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v13, v14, 0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v12, v14, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v15, v6
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v9, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v13, vcc_lo, v13, v0
; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, v12, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_lo_u32 v6, v11, v13
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v10, v13, 0
; GFX11-NEXT: v_mul_lo_u32 v7, v10, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_hi_u32 v11, v13, v0
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v12, v0, 0
; GFX11-NEXT: v_add3_u32 v10, v1, v7, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v13, v10, 0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v11, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v9, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v13, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, v12, v1, vcc_lo
; GFX11-NEXT: v_mul_hi_u32 v11, s4, v8
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, s5, v8, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, s4, v10, 0
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, s5, v10, 0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v11, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v1, v7, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX11-NEXT: v_mul_lo_u32 v8, v5, v6
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v6, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v9, v4, v7
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8
; GFX11-NEXT: v_add_co_u32 v9, s0, v6, 2
; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v7, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_nc_u32_e32 v8, s5, v1
; GFX11-NEXT: v_sub_co_u32 v11, s0, v0, v4
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, v1, vcc_lo
; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, null, v8, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v11, v4
; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, null, 0, v8, s0
; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v1, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v8, v5
; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v4
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v5
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0
; GFX11-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v6, 1
; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v7, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-NEXT: v_dual_cndmask_b32 v1, v12, v10 :: v_dual_cndmask_b32 v4, v11, v9
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v1 :: v_dual_cndmask_b32 v0, v6, v4
; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: .LBB28_2: ; %Flow1
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s1
; GFX11-NEXT: s_cbranch_execz .LBB28_4
; GFX11-NEXT: ; %bb.3:
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v4
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_lo_u32 v1, v1, v0
; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v1, v0, v4
; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v6, v1, v4
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v4
; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_cndmask_b32 v0, v0, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v4
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX11-NEXT: .LBB28_4:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_or_b32_e32 v5, s7, v3
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0
; GFX11-NEXT: s_cbranch_execz .LBB28_6
; GFX11-NEXT: ; %bb.5:
; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v2
; GFX11-NEXT: v_cvt_f32_u32_e32 v5, v3
; GFX11-NEXT: v_sub_co_u32 v11, vcc_lo, 0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_co_ci_u32_e64 v12, null, 0, v3, vcc_lo
; GFX11-NEXT: v_fmamk_f32 v4, v5, 0x4f800000, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GFX11-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_trunc_f32_e32 v5, v5
; GFX11-NEXT: v_fmamk_f32 v4, v5, 0xcf800000, v4
; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_u32_f32_e32 v14, v4
; GFX11-NEXT: v_mul_lo_u32 v6, v11, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_lo_u32 v7, v12, v14
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v11, v14, 0
; GFX11-NEXT: v_add3_u32 v15, v5, v6, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_hi_u32 v16, v14, v4
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v13, v4, 0
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v14, v15, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v13, v15, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v16, v5
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v8, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v9
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, v4
; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, v13, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_lo_u32 v6, v12, v14
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v11, v14, 0
; GFX11-NEXT: v_mul_lo_u32 v7, v11, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_hi_u32 v12, v14, v4
; GFX11-NEXT: v_add3_u32 v11, v5, v7, v6
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v13, v4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v14, v11, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v13, v11, 0
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v12, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v8, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v14, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, v13, v5, vcc_lo
; GFX11-NEXT: v_mul_hi_u32 v11, s6, v8
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, s7, v8, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, s6, v10, 0
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, s7, v10, 0
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v11, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v7, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v4, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo
; GFX11-NEXT: v_mul_lo_u32 v8, v3, v6
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v2, v6, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v9, v2, v7
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, s6, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v5, v5, v9, v8
; GFX11-NEXT: v_add_co_u32 v9, s0, v6, 2
; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v7, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_nc_u32_e32 v8, s7, v5
; GFX11-NEXT: v_sub_co_u32 v11, s0, v4, v2
; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, s7, v5, vcc_lo
; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, null, v8, v3, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v11, v2
; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, null, 0, v8, s0
; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v5, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v8, v3
; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v3
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v2, s0
; GFX11-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v6, 1
; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v7, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-NEXT: v_dual_cndmask_b32 v3, v12, v10 :: v_dual_cndmask_b32 v4, v11, v9
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v3 :: v_dual_cndmask_b32 v4, v6, v4
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: .LBB28_6: ; %Flow
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s1
; GFX11-NEXT: s_cbranch_execz .LBB28_8
; GFX11-NEXT: ; %bb.7:
; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v2
; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_lo_u32 v4, v4, v3
; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4
; GFX11-NEXT: v_mul_hi_u32 v3, s6, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v4, v3, v2
; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v3
; GFX11-NEXT: v_sub_nc_u32_e32 v4, s6, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v6, v4, v2
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2
; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2
; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_cndmask_b32 v4, v3, v5 :: v_dual_mov_b32 v5, 0
; GFX11-NEXT: .LBB28_8:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> poison, i32 0, i32 0)
%cast = bitcast <4 x float> %val to <2 x i64>
%div = udiv <2 x i64> %cast, %arg
ret <2 x i64> %div
}
declare half @llvm.canonicalize.f16(half)
; Canonicalizes the subnormal half constant 0xH03F0, extends it to float and
; stores the float's bit pattern to %out by way of a <1 x i32> bitcast.
; Every run configuration is expected to store the immediate 0x387c0000, which
; is the float encoding of that half value, i.e. the whole chain is folded to a
; constant and the subnormal input is not flushed to zero.
define amdgpu_kernel void @bitcast_f32_to_v1i32(ptr addrspace(1) %out) {
; GCN-LABEL: bitcast_f32_to_v1i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x387c0000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_f32_to_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x387c0000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_f32_to_v1i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x387c0000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_f32_to_v1i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x387c0000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; The canonicalize call carries the arcp and afn fast-math flags.
%f16 = call arcp afn half @llvm.canonicalize.f16(half 0xH03F0)
%f32 = fpext half %f16 to float
; Scalar float -> single-element integer vector, then unwrap its only lane.
%v = bitcast float %f32 to <1 x i32>
%v1 = extractelement <1 x i32> %v, i32 0
store i32 %v1, ptr addrspace(1) %out
ret void
}
; Bitcasts a <4 x i64> phi to <16 x i16> inside a self-looping block and stores
; the value selected by a second phi to %out (32 bytes).
; %if is entered from %entry only when %cond == 0 and branches back to itself
; only when %cond == 1. In the expected output of every run configuration only
; zero-filled registers are stored, and just two scalar loads appear (one
; dword, one 64-bit), so %value is presumably never read.
define amdgpu_kernel void @bitcast_v4i64_to_v16i16(i32 %cond, ptr addrspace(1) %out, <4 x i64> %value) {
; GCN-LABEL: bitcast_v4i64_to_v16i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v4i64_to_v16i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s9, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s9, 0
; VI-NEXT: s_add_u32 s6, s4, 16
; VI-NEXT: s_addc_u32 s7, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v4i64_to_v16i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v4i64_to_v16i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_mov_b32_e32 v6, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX11-NEXT: s_cmp_lg_u32 s9, 0
; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on first entry from %entry; %value only on the %if -> %if back edge.
%phi_value = phi <4 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <4 x i64> %phi_value to <16 x i16>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was skipped, otherwise the bitcast result from %if.
%phi_cast = phi <16 x i16> [zeroinitializer, %entry], [%cast, %if]
store <16 x i16> %phi_cast, ptr addrspace(1) %out
ret void
}
; Bitcasts a <4 x double> phi to <16 x half> inside a self-looping block and
; stores the value selected by a second phi to %out (32 bytes).
; %if is entered from %entry only when %cond == 0 and branches back to itself
; only when %cond == 1. The expected output of every run configuration stores
; values derived from zero only. For the first run line (no -mcpu) the zeros
; are additionally passed through v_cvt_f16_f32 and packed pairwise with
; shift/or before the two stores; the other configurations store zero-filled
; registers directly.
define amdgpu_kernel void @bitcast_v4f64_to_v16f16(i32 %cond, ptr addrspace(1) %out, <4 x double> %value) {
; GCN-LABEL: bitcast_v4f64_to_v16f16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s11, 0
; GCN-NEXT: s_mov_b32 s18, 0
; GCN-NEXT: s_mov_b32 s15, 0
; GCN-NEXT: s_mov_b32 s19, 0
; GCN-NEXT: s_mov_b32 s16, 0
; GCN-NEXT: s_mov_b32 s20, 0
; GCN-NEXT: s_mov_b32 s17, 0
; GCN-NEXT: s_mov_b32 s21, 0
; GCN-NEXT: s_mov_b32 s8, 0
; GCN-NEXT: s_mov_b32 s12, 0
; GCN-NEXT: s_mov_b32 s9, 0
; GCN-NEXT: s_mov_b32 s13, 0
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_mov_b32 s14, 0
; GCN-NEXT: s_mov_b32 s7, 0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_cvt_f16_f32_e32 v0, s18
; GCN-NEXT: v_cvt_f16_f32_e32 v1, s11
; GCN-NEXT: v_cvt_f16_f32_e32 v2, s19
; GCN-NEXT: v_cvt_f16_f32_e32 v3, s15
; GCN-NEXT: v_cvt_f16_f32_e32 v4, s20
; GCN-NEXT: v_cvt_f16_f32_e32 v5, s16
; GCN-NEXT: v_cvt_f16_f32_e32 v6, s21
; GCN-NEXT: v_cvt_f16_f32_e32 v7, s17
; GCN-NEXT: v_cvt_f16_f32_e32 v8, s12
; GCN-NEXT: v_cvt_f16_f32_e32 v9, s8
; GCN-NEXT: v_cvt_f16_f32_e32 v10, s13
; GCN-NEXT: v_cvt_f16_f32_e32 v11, s9
; GCN-NEXT: v_cvt_f16_f32_e32 v12, s14
; GCN-NEXT: v_cvt_f16_f32_e32 v13, s10
; GCN-NEXT: v_cvt_f16_f32_e32 v14, s6
; GCN-NEXT: v_cvt_f16_f32_e32 v15, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_or_b32_e32 v1, v3, v2
; GCN-NEXT: v_or_b32_e32 v2, v5, v4
; GCN-NEXT: v_or_b32_e32 v3, v7, v6
; GCN-NEXT: v_or_b32_e32 v4, v9, v8
; GCN-NEXT: v_or_b32_e32 v5, v11, v10
; GCN-NEXT: v_or_b32_e32 v6, v13, v12
; GCN-NEXT: v_or_b32_e32 v7, v15, v14
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v4f64_to_v16f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s9, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s9, 0
; VI-NEXT: s_add_u32 s6, s4, 16
; VI-NEXT: s_addc_u32 s7, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v4f64_to_v16f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v4f64_to_v16f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_mov_b32_e32 v6, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX11-NEXT: s_cmp_lg_u32 s9, 0
; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on first entry from %entry; %value only on the %if -> %if back edge.
%phi_value = phi <4 x double> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <4 x double> %phi_value to <16 x half>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was skipped, otherwise the bitcast result from %if.
%phi_cast = phi <16 x half> [zeroinitializer, %entry], [%cast, %if]
store <16 x half> %phi_cast, ptr addrspace(1) %out
ret void
}
; Bitcasts a <16 x i16> phi to <4 x i64> inside a self-looping block and stores
; the value selected by a second phi to %out (32 bytes).
; %if is entered from %entry only when %cond == 0 and branches back to itself
; only when %cond == 1. In the expected output of every run configuration only
; zero-filled registers are stored, and just two scalar loads appear (one
; dword, one 64-bit), so %value is presumably never read.
define amdgpu_kernel void @bitcast_v16i16_to_v4i64(i32 %cond, ptr addrspace(1) %out, <16 x i16> %value) {
; GCN-LABEL: bitcast_v16i16_to_v4i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v16i16_to_v4i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s9, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s9, 0
; VI-NEXT: s_add_u32 s6, s4, 16
; VI-NEXT: s_addc_u32 s7, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v16i16_to_v4i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v16i16_to_v4i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_mov_b32_e32 v6, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX11-NEXT: s_cmp_lg_u32 s9, 0
; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on first entry from %entry; %value only on the %if -> %if back edge.
%phi_value = phi <16 x i16> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <16 x i16> %phi_value to <4 x i64>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was skipped, otherwise the bitcast result from %if.
%phi_cast = phi <4 x i64> [zeroinitializer, %entry], [%cast, %if]
store <4 x i64> %phi_cast, ptr addrspace(1) %out
ret void
}
; Bitcasts a <16 x half> phi to <4 x double> inside a self-looping block and
; stores the value selected by a second phi to %out (32 bytes).
; %if is entered from %entry only when %cond == 0 and branches back to itself
; only when %cond == 1. In the expected output of every run configuration only
; zero-filled registers are stored, and just two scalar loads appear (one
; dword, one 64-bit), so %value is presumably never read.
define amdgpu_kernel void @bitcast_v16f16_to_v4f64(i32 %cond, ptr addrspace(1) %out, <16 x half> %value) {
; GCN-LABEL: bitcast_v16f16_to_v4f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v16f16_to_v4f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s9, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s9, 0
; VI-NEXT: s_add_u32 s6, s4, 16
; VI-NEXT: s_addc_u32 s7, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v16f16_to_v4f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v16f16_to_v4f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_mov_b32_e32 v6, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX11-NEXT: s_cmp_lg_u32 s9, 0
; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on first entry from %entry; %value only on the %if -> %if back edge.
%phi_value = phi <16 x half> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <16 x half> %phi_value to <4 x double>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was skipped, otherwise the bitcast result from %if.
%phi_cast = phi <4 x double> [zeroinitializer, %entry], [%cast, %if]
store <4 x double> %phi_cast, ptr addrspace(1) %out
ret void
}
; Bitcasts a <20 x half> phi to <5 x double> inside a self-looping block and
; stores the value selected by a second phi to %out.
; %if is entered from %entry only when %cond == 0 and branches back to itself
; only when %cond == 1. In the expected output of every run configuration the
; 40-byte result is written as two 128-bit stores (offsets 0 and 16) plus one
; 64-bit store at offset 32, all from zero-filled registers, and only two
; scalar loads appear, so %value is presumably never read.
define amdgpu_kernel void @bitcast_v20f16_to_v5f64(i32 %cond, ptr addrspace(1) %out, <20 x half> %value) {
; GCN-LABEL: bitcast_v20f16_to_v5f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v20f16_to_v5f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v20f16_to_v5f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v20f16_to_v5f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on first entry from %entry; %value only on the %if -> %if back edge.
%phi_value = phi <20 x half> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <20 x half> %phi_value to <5 x double>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was skipped, otherwise the bitcast result from %if.
%phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
store <5 x double> %phi_cast, ptr addrspace(1) %out
ret void
}
; Bitcasts a <10 x float> phi to <5 x double> inside a self-looping block and
; stores the value selected by a second phi to %out.
; %if is entered from %entry only when %cond == 0 and branches back to itself
; only when %cond == 1. In the expected output of every run configuration the
; 40-byte result is written as two 128-bit stores (offsets 0 and 16) plus one
; 64-bit store at offset 32, all from zero-filled registers, and only two
; scalar loads appear, so %value is presumably never read.
define amdgpu_kernel void @bitcast_v10f32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
; GCN-LABEL: bitcast_v10f32_to_v5f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v10f32_to_v5f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v10f32_to_v5f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v10f32_to_v5f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on first entry from %entry; %value only on the %if -> %if back edge.
%phi_value = phi <10 x float> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <10 x float> %phi_value to <5 x double>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was skipped, otherwise the bitcast result from %if.
%phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
store <5 x double> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel that sends a <10 x i32> through a phi in a self-looping block,
; bitcasts the phi to <5 x double>, and stores the value picked by a second
; phi to %out.
; The check lines inside this function are autogenerated (the note at the top
; of this file names the generating script); regenerate them with that script
; rather than editing them by hand.
; In all four checked configurations the expected code performs only two scalar
; loads (one dword-sized, one two-dword-sized) and stores 40 bytes taken from
; registers that were copied from a scalar register set to 0.
define amdgpu_kernel void @bitcast_v10i32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
; GCN-LABEL: bitcast_v10i32_to_v5f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v10i32_to_v5f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v10i32_to_v5f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v10i32_to_v5f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to the store.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; %value reaches this phi only over the back edge from %if itself.
%phi_value = phi <10 x i32> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <10 x i32> %phi_value to <5 x double>
; The back edge needs %cond == 1, but %if is first reached from %entry only
; when %cond == 0, which is consistent with the all-zero stores checked above.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
store <5 x double> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel that sends a <10 x float> through a phi in a self-looping block,
; bitcasts the phi to <5 x i64>, and stores the value picked by a second phi
; to %out.
; The check lines inside this function are autogenerated (the note at the top
; of this file names the generating script); regenerate them with that script
; rather than editing them by hand.
; In all four checked configurations the expected code performs only two scalar
; loads (one dword-sized, one two-dword-sized) and stores 40 bytes taken from
; registers that were copied from a scalar register set to 0.
define amdgpu_kernel void @bitcast_v10f32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
; GCN-LABEL: bitcast_v10f32_to_v5i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v10f32_to_v5i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v10f32_to_v5i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v10f32_to_v5i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to the store.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; %value reaches this phi only over the back edge from %if itself.
%phi_value = phi <10 x float> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <10 x float> %phi_value to <5 x i64>
; The back edge needs %cond == 1, but %if is first reached from %entry only
; when %cond == 0, which is consistent with the all-zero stores checked above.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
store <5 x i64> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel that sends a <10 x i32> through a phi in a self-looping block,
; bitcasts the phi to <5 x i64>, and stores the value picked by a second phi
; to %out.
; The check lines inside this function are autogenerated (the note at the top
; of this file names the generating script); regenerate them with that script
; rather than editing them by hand.
; In all four checked configurations the expected code performs only two scalar
; loads (one dword-sized, one two-dword-sized) and stores 40 bytes taken from
; registers that were copied from a scalar register set to 0.
define amdgpu_kernel void @bitcast_v10i32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
; GCN-LABEL: bitcast_v10i32_to_v5i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v10i32_to_v5i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v10i32_to_v5i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v10i32_to_v5i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to the store.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; %value reaches this phi only over the back edge from %if itself.
%phi_value = phi <10 x i32> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <10 x i32> %phi_value to <5 x i64>
; The back edge needs %cond == 1, but %if is first reached from %entry only
; when %cond == 0, which is consistent with the all-zero stores checked above.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
store <5 x i64> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel that sends a <40 x i8> through a phi in a self-looping block,
; bitcasts the phi to <5 x double>, and stores the value picked by a second
; phi to %out.
; The check lines inside this function are autogenerated (the note at the top
; of this file names the generating script); regenerate them with that script
; rather than editing them by hand.
; In all four checked configurations the expected code performs only two scalar
; loads (one dword-sized, one two-dword-sized) and stores 40 bytes taken from
; registers that were copied from a scalar register set to 0.
define amdgpu_kernel void @bitcast_v40i8_to_v5f64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) {
; GCN-LABEL: bitcast_v40i8_to_v5f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v40i8_to_v5f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v40i8_to_v5f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v40i8_to_v5f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to the store.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; %value reaches this phi only over the back edge from %if itself.
%phi_value = phi <40 x i8> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <40 x i8> %phi_value to <5 x double>
; The back edge needs %cond == 1, but %if is first reached from %entry only
; when %cond == 0, which is consistent with the all-zero stores checked above.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
store <5 x double> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel that sends a <40 x i8> through a phi in a self-looping block,
; bitcasts the phi to <5 x i64>, and stores the value picked by a second phi
; to %out.
; The check lines inside this function are autogenerated (the note at the top
; of this file names the generating script); regenerate them with that script
; rather than editing them by hand.
; In all four checked configurations the expected code performs only two scalar
; loads (one dword-sized, one two-dword-sized) and stores 40 bytes taken from
; registers that were copied from a scalar register set to 0.
define amdgpu_kernel void @bitcast_v40i8_to_v5i64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) {
; GCN-LABEL: bitcast_v40i8_to_v5i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v40i8_to_v5i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v40i8_to_v5i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v40i8_to_v5i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to the store.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; %value reaches this phi only over the back edge from %if itself.
%phi_value = phi <40 x i8> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <40 x i8> %phi_value to <5 x i64>
; The back edge needs %cond == 1, but %if is first reached from %entry only
; when %cond == 0, which is consistent with the all-zero stores checked above.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if]
store <5 x i64> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel that sends a <5 x double> through a phi in a self-looping block,
; bitcasts the phi to <10 x float>, and stores the value picked by a second
; phi to %out.
; The check lines inside this function are autogenerated (the note at the top
; of this file names the generating script); regenerate them with that script
; rather than editing them by hand.
; In all four checked configurations the expected code performs only two scalar
; loads (one dword-sized, one two-dword-sized) and stores 40 bytes taken from
; registers that were copied from a scalar register set to 0.
define amdgpu_kernel void @bitcast_v5f64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) {
; GCN-LABEL: bitcast_v5f64_to_v10f32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v5f64_to_v10f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s7, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s7, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v5f64_to_v10f32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s7, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v5f64_to_v10f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_cmp_lg_u32 s7, 0
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to the store.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; %value reaches this phi only over the back edge from %if itself.
%phi_value = phi <5 x double> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <5 x double> %phi_value to <10 x float>
; The back edge needs %cond == 1, but %if is first reached from %entry only
; when %cond == 0, which is consistent with the all-zero stores checked above.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi_cast = phi <10 x float> [zeroinitializer, %entry], [%cast, %if]
store <10 x float> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel that sends a <5 x double> through a phi in a self-looping block,
; bitcasts the phi to <10 x i32>, and stores the value picked by a second phi
; to %out.
; The check lines inside this function are autogenerated (the note at the top
; of this file names the generating script); regenerate them with that script
; rather than editing them by hand.
; In all four checked configurations the expected code performs only two scalar
; loads (one dword-sized, one two-dword-sized) and stores 40 bytes taken from
; registers that were copied from a scalar register set to 0.
define amdgpu_kernel void @bitcast_v5f64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) {
; GCN-LABEL: bitcast_v5f64_to_v10i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v5f64_to_v10i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s7, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s7, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v5f64_to_v10i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s7, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v5f64_to_v10i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_cmp_lg_u32 s7, 0
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to the store.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; %value reaches this phi only over the back edge from %if itself.
%phi_value = phi <5 x double> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <5 x double> %phi_value to <10 x i32>
; The back edge needs %cond == 1, but %if is first reached from %entry only
; when %cond == 0, which is consistent with the all-zero stores checked above.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi_cast = phi <10 x i32> [zeroinitializer, %entry], [%cast, %if]
store <10 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel that sends a <5 x i64> through a phi in a self-looping block,
; bitcasts the phi to <10 x float>, and stores the value picked by a second
; phi to %out.
; The check lines inside this function are autogenerated (the note at the top
; of this file names the generating script); regenerate them with that script
; rather than editing them by hand.
; In all four checked configurations the expected code performs only two scalar
; loads (one dword-sized, one two-dword-sized) and stores 40 bytes taken from
; registers that were copied from a scalar register set to 0.
define amdgpu_kernel void @bitcast_v5i64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) {
; GCN-LABEL: bitcast_v5i64_to_v10f32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v5i64_to_v10f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s7, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s7, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v5i64_to_v10f32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s7, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v5i64_to_v10f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_cmp_lg_u32 s7, 0
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to the store.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; %value reaches this phi only over the back edge from %if itself.
%phi_value = phi <5 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <5 x i64> %phi_value to <10 x float>
; The back edge needs %cond == 1, but %if is first reached from %entry only
; when %cond == 0, which is consistent with the all-zero stores checked above.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi_cast = phi <10 x float> [zeroinitializer, %entry], [%cast, %if]
store <10 x float> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel exercising a <5 x i64> -> <10 x i32> bitcast whose operand is a loop
; phi and whose result feeds a second phi in the exit block.
;   %cond  - selects the path taken through the CFG
;   %out   - global (addrspace(1)) pointer the final vector is stored to
;   %value - vector supplied to the loop phi on the back-edge
; The assertions that follow are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In every expected sequence the stored data registers are copied from an SGPR
; that was set to 0, and only two scalar loads appear (one feeds the compare,
; the other supplies the store base address).
define amdgpu_kernel void @bitcast_v5i64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) {
; GCN-LABEL: bitcast_v5i64_to_v10i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s15, 0xf000
; GCN-NEXT: s_mov_b32 s14, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v5i64_to_v10i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s7, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s7, 0
; VI-NEXT: s_add_u32 s8, s4, 16
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v5i64_to_v10i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s7, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v5i64_to_v10i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
; GFX11-NEXT: s_cmp_lg_u32 s7, 0
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on the edge from %entry, %value on the self-loop back-edge.
%phi_value = phi <5 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <5 x i64> %phi_value to <10 x i32>
; The back-edge needs %cond == 1, which cannot hold because this block is only
; reached with %cond == 0; %phi_value is therefore always zero.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was bypassed, otherwise the bitcast result from %if.
%phi_cast = phi <10 x i32> [zeroinitializer, %entry], [%cast, %if]
store <10 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel exercising a <6 x double> -> <12 x i32> bitcast whose operand is a
; loop phi and whose result feeds a second phi in the exit block.
;   %cond  - selects the path taken through the CFG
;   %out   - global (addrspace(1)) pointer the final vector is stored to
;   %value - vector supplied to the loop phi on the back-edge
; The assertions that follow are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In every expected sequence the stored data registers are copied from an SGPR
; that was set to 0, and only two scalar loads appear (one feeds the compare,
; the other supplies the store base address).
define amdgpu_kernel void @bitcast_v6f64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) {
; GCN-LABEL: bitcast_v6f64_to_v12i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s19, 0xf000
; GCN-NEXT: s_mov_b32 s18, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: v_mov_b32_e32 v10, s0
; GCN-NEXT: v_mov_b32_e32 v11, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v6f64_to_v12i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s9, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s9, 0
; VI-NEXT: s_add_u32 s10, s4, 16
; VI-NEXT: s_addc_u32 s11, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s10
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v6f64_to_v12i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v6f64_to_v12i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_mov_b32_e32 v10, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
; GFX11-NEXT: s_cmp_lg_u32 s9, 0
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on the edge from %entry, %value on the self-loop back-edge.
%phi_value = phi <6 x double> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <6 x double> %phi_value to <12 x i32>
; The back-edge needs %cond == 1, which cannot hold because this block is only
; reached with %cond == 0; %phi_value is therefore always zero.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was bypassed, otherwise the bitcast result from %if.
%phi_cast = phi <12 x i32> [zeroinitializer, %entry], [%cast, %if]
store <12 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel exercising a <6 x double> -> <12 x float> bitcast whose operand is a
; loop phi and whose result feeds a second phi in the exit block.
;   %cond  - selects the path taken through the CFG
;   %out   - global (addrspace(1)) pointer the final vector is stored to
;   %value - vector supplied to the loop phi on the back-edge
; The assertions that follow are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In every expected sequence the stored data registers are copied from an SGPR
; that was set to 0, and only two scalar loads appear (one feeds the compare,
; the other supplies the store base address).
define amdgpu_kernel void @bitcast_v6f64_to_v12f32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) {
; GCN-LABEL: bitcast_v6f64_to_v12f32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s19, 0xf000
; GCN-NEXT: s_mov_b32 s18, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: v_mov_b32_e32 v10, s0
; GCN-NEXT: v_mov_b32_e32 v11, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v6f64_to_v12f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s9, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s9, 0
; VI-NEXT: s_add_u32 s10, s4, 16
; VI-NEXT: s_addc_u32 s11, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s10
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v6f64_to_v12f32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v6f64_to_v12f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_mov_b32_e32 v10, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
; GFX11-NEXT: s_cmp_lg_u32 s9, 0
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on the edge from %entry, %value on the self-loop back-edge.
%phi_value = phi <6 x double> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <6 x double> %phi_value to <12 x float>
; The back-edge needs %cond == 1, which cannot hold because this block is only
; reached with %cond == 0; %phi_value is therefore always zero.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was bypassed, otherwise the bitcast result from %if.
%phi_cast = phi <12 x float> [zeroinitializer, %entry], [%cast, %if]
store <12 x float> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel exercising a <12 x i32> -> <6 x i64> bitcast whose operand is a loop
; phi and whose result feeds a second phi in the exit block.
;   %cond  - selects the path taken through the CFG
;   %out   - global (addrspace(1)) pointer the final vector is stored to
;   %value - vector supplied to the loop phi on the back-edge
; The assertions that follow are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In every expected sequence the stored data registers are copied from an SGPR
; that was set to 0, and only two scalar loads appear (one feeds the compare,
; the other supplies the store base address).
define amdgpu_kernel void @bitcast_v12i32_to_v6i64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) {
; GCN-LABEL: bitcast_v12i32_to_v6i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s19, 0xf000
; GCN-NEXT: s_mov_b32 s18, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: v_mov_b32_e32 v10, s0
; GCN-NEXT: v_mov_b32_e32 v11, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v12i32_to_v6i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s10, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s11, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s10
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v12i32_to_v6i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v12i32_to_v6i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_mov_b32_e32 v10, s0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on the edge from %entry, %value on the self-loop back-edge.
%phi_value = phi <12 x i32> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <12 x i32> %phi_value to <6 x i64>
; The back-edge needs %cond == 1, which cannot hold because this block is only
; reached with %cond == 0; %phi_value is therefore always zero.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was bypassed, otherwise the bitcast result from %if.
%phi_cast = phi <6 x i64> [zeroinitializer, %entry], [%cast, %if]
store <6 x i64> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel exercising a <12 x i32> -> <6 x double> bitcast whose operand is a
; loop phi and whose result feeds a second phi in the exit block.
;   %cond  - selects the path taken through the CFG
;   %out   - global (addrspace(1)) pointer the final vector is stored to
;   %value - vector supplied to the loop phi on the back-edge
; The assertions that follow are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In every expected sequence the stored data registers are copied from an SGPR
; that was set to 0, and only two scalar loads appear (one feeds the compare,
; the other supplies the store base address).
define amdgpu_kernel void @bitcast_v12i32_to_v6f64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) {
; GCN-LABEL: bitcast_v12i32_to_v6f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s19, 0xf000
; GCN-NEXT: s_mov_b32 s18, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: v_mov_b32_e32 v10, s0
; GCN-NEXT: v_mov_b32_e32 v11, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v12i32_to_v6f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s10, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s11, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s10
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v12i32_to_v6f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v12i32_to_v6f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_mov_b32_e32 v10, s0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on the edge from %entry, %value on the self-loop back-edge.
%phi_value = phi <12 x i32> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <12 x i32> %phi_value to <6 x double>
; The back-edge needs %cond == 1, which cannot hold because this block is only
; reached with %cond == 0; %phi_value is therefore always zero.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was bypassed, otherwise the bitcast result from %if.
%phi_cast = phi <6 x double> [zeroinitializer, %entry], [%cast, %if]
store <6 x double> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel exercising a <6 x i64> -> <12 x i32> bitcast whose operand is a loop
; phi and whose result feeds a second phi in the exit block.
;   %cond  - selects the path taken through the CFG
;   %out   - global (addrspace(1)) pointer the final vector is stored to
;   %value - vector supplied to the loop phi on the back-edge
; The assertions that follow are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In every expected sequence the stored data registers are copied from an SGPR
; that was set to 0, and only two scalar loads appear (one feeds the compare,
; the other supplies the store base address).
define amdgpu_kernel void @bitcast_v6i64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x i64> %value) {
; GCN-LABEL: bitcast_v6i64_to_v12i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s19, 0xf000
; GCN-NEXT: s_mov_b32 s18, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: v_mov_b32_e32 v10, s0
; GCN-NEXT: v_mov_b32_e32 v11, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v6i64_to_v12i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s9, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s9, 0
; VI-NEXT: s_add_u32 s10, s4, 16
; VI-NEXT: s_addc_u32 s11, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s10
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v6i64_to_v12i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v6i64_to_v12i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_mov_b32_e32 v10, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
; GFX11-NEXT: s_cmp_lg_u32 s9, 0
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on the edge from %entry, %value on the self-loop back-edge.
%phi_value = phi <6 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <6 x i64> %phi_value to <12 x i32>
; The back-edge needs %cond == 1, which cannot hold because this block is only
; reached with %cond == 0; %phi_value is therefore always zero.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was bypassed, otherwise the bitcast result from %if.
%phi_cast = phi <12 x i32> [zeroinitializer, %entry], [%cast, %if]
store <12 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel exercising a <7 x i64> -> <14 x i32> bitcast whose operand is a loop
; phi and whose result feeds a second phi in the exit block.
;   %cond  - selects the path taken through the CFG
;   %out   - global (addrspace(1)) pointer the final vector is stored to
;   %value - vector supplied to the loop phi on the back-edge
; The assertions that follow are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In every expected sequence the stored data registers are copied from an SGPR
; that was set to 0, and only two scalar loads appear (one feeds the compare,
; the other supplies the store base address). The 14-dword result is expected
; as three 4-dword stores plus one 2-dword store at offset 48.
define amdgpu_kernel void @bitcast_v7i64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x i64> %value) {
; GCN-LABEL: bitcast_v7i64_to_v14i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s19, 0xf000
; GCN-NEXT: s_mov_b32 s18, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v12, s0
; GCN-NEXT: v_mov_b32_e32 v13, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: v_mov_b32_e32 v10, s0
; GCN-NEXT: v_mov_b32_e32 v11, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
; GCN-NEXT: buffer_store_dwordx2 v[12:13], off, s[16:19], 0 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v7i64_to_v14i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s12, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s13, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s8, s4, 48
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v7i64_to_v14i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:48
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v7i64_to_v14i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v13, s0
; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_mov_b32_e32 v10, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v14, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v14, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v14, v[12:13], s[4:5] offset:48
; GFX11-NEXT: global_store_b128 v14, v[8:11], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
entry:
; Enter the loop block only when %cond == 0; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Zero on the edge from %entry, %value on the self-loop back-edge.
%phi_value = phi <7 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <7 x i64> %phi_value to <14 x i32>
; The back-edge needs %cond == 1, which cannot hold because this block is only
; reached with %cond == 0; %phi_value is therefore always zero.
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Zero when %if was bypassed, otherwise the bitcast result from %if.
%phi_cast = phi <14 x i32> [zeroinitializer, %entry], [%cast, %if]
store <14 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel test: a <7 x double> flows through a phi in a self-looping block, is
; bitcast to <14 x i32>, merged with zero by a second phi and stored to %out.
; %if is entered only when %cond == 0 and branches back to itself only when
; %cond == 1, so the %value edge of the first phi is never taken at run time.
; The expected code for each llc configuration below is a single block that
; stores 56 bytes (14 dwords) of zero and never loads %value.
define amdgpu_kernel void @bitcast_v7f64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x double> %value) {
; GCN-LABEL: bitcast_v7f64_to_v14i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s19, 0xf000
; GCN-NEXT: s_mov_b32 s18, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: v_mov_b32_e32 v6, s0
; GCN-NEXT: v_mov_b32_e32 v7, s0
; GCN-NEXT: v_mov_b32_e32 v12, s0
; GCN-NEXT: v_mov_b32_e32 v13, s0
; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: v_mov_b32_e32 v9, s0
; GCN-NEXT: v_mov_b32_e32 v10, s0
; GCN-NEXT: v_mov_b32_e32 v11, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
; GCN-NEXT: buffer_store_dwordx2 v[12:13], off, s[16:19], 0 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v7f64_to_v14i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s12, s4, 16
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s13, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s8, s4, 48
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s9, s5, 0
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v7f64_to_v14i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:48
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v7f64_to_v14i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v13, s0
; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_mov_b32_e32 v10, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v14, v[0:3], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v14, v[4:7], s[4:5]
; GFX11-NEXT: global_store_b64 v14, v[12:13], s[4:5] offset:48
; GFX11-NEXT: global_store_b128 v14, v[8:11], s[4:5] offset:32
; GFX11-NEXT: s_endpgm
; Enter %if only when %cond is zero; otherwise go straight to the store.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Self-looping block: the phi is zero on the edge from %entry and %value on
; the back-edge, which is taken only when %cond == 1.
if:
%phi_value = phi <7 x double> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <7 x double> %phi_value to <14 x i32>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
; Merge zero (from %entry) with the bitcast result (from %if) and store it.
end:
%phi_cast = phi <14 x i32> [zeroinitializer, %entry], [%cast, %if]
store <14 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel test: a <9 x i64> flows through a phi in a self-looping block, is
; bitcast to <18 x i32>, merged with zero by a second phi and stored to %out.
; %if is entered only when %cond == 0 and branches back to itself only when
; %cond == 1, so the %value edge of the first phi is never taken at run time.
; The expected code for each llc configuration below is a single block that
; stores 72 bytes (18 dwords) of zero and never loads %value.
define amdgpu_kernel void @bitcast_v9i64_to_v18i32(i32 %cond, ptr addrspace(1) %out, <9 x i64> %value) {
; GCN-LABEL: bitcast_v9i64_to_v18i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s6
; GCN-NEXT: v_mov_b32_e32 v8, s6
; GCN-NEXT: v_mov_b32_e32 v9, s6
; GCN-NEXT: v_mov_b32_e32 v10, s6
; GCN-NEXT: v_mov_b32_e32 v11, s6
; GCN-NEXT: v_mov_b32_e32 v12, s6
; GCN-NEXT: v_mov_b32_e32 v13, s6
; GCN-NEXT: v_mov_b32_e32 v14, s6
; GCN-NEXT: v_mov_b32_e32 v15, s6
; GCN-NEXT: v_mov_b32_e32 v16, s6
; GCN-NEXT: v_mov_b32_e32 v17, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx2 v[16:17], off, s[0:3], 0 offset:64
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v9i64_to_v18i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s16, s4, 48
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s17, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: s_add_u32 s12, s4, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s13, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: s_add_u32 s10, s4, 16
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s11, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s10
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_add_u32 s0, s4, 64
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v9i64_to_v18i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[20:21] offset:64
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v9i64_to_v18i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v18, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
; GFX11-NEXT: v_mov_b32_e32 v16, s0
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: global_store_b128 v18, v[0:3], s[4:5] offset:48
; GFX11-NEXT: global_store_b128 v18, v[4:7], s[4:5] offset:32
; GFX11-NEXT: global_store_b128 v18, v[8:11], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v18, v[12:15], s[4:5]
; GFX11-NEXT: global_store_b64 v18, v[16:17], s[4:5] offset:64
; GFX11-NEXT: s_endpgm
; Enter %if only when %cond is zero; otherwise go straight to the store.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Self-looping block: the phi is zero on the edge from %entry and %value on
; the back-edge, which is taken only when %cond == 1.
if:
%phi_value = phi <9 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <9 x i64> %phi_value to <18 x i32>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
; Merge zero (from %entry) with the bitcast result (from %if) and store it.
end:
%phi_cast = phi <18 x i32> [zeroinitializer, %entry], [%cast, %if]
store <18 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel test: a <10 x i64> flows through a phi in a self-looping block, is
; bitcast to <20 x i32>, merged with zero by a second phi and stored to %out.
; %if is entered only when %cond == 0 and branches back to itself only when
; %cond == 1, so the %value edge of the first phi is never taken at run time.
; The expected code for each llc configuration below is a single block that
; stores 80 bytes (20 dwords) of zero and never loads %value.
define amdgpu_kernel void @bitcast_v10i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <10 x i64> %value) {
; GCN-LABEL: bitcast_v10i64_to_v20i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s6
; GCN-NEXT: v_mov_b32_e32 v8, s6
; GCN-NEXT: v_mov_b32_e32 v9, s6
; GCN-NEXT: v_mov_b32_e32 v10, s6
; GCN-NEXT: v_mov_b32_e32 v11, s6
; GCN-NEXT: v_mov_b32_e32 v12, s6
; GCN-NEXT: v_mov_b32_e32 v13, s6
; GCN-NEXT: v_mov_b32_e32 v14, s6
; GCN-NEXT: v_mov_b32_e32 v15, s6
; GCN-NEXT: v_mov_b32_e32 v16, s6
; GCN-NEXT: v_mov_b32_e32 v17, s6
; GCN-NEXT: v_mov_b32_e32 v18, s6
; GCN-NEXT: v_mov_b32_e32 v19, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v10i64_to_v20i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s18, s4, 48
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s19, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s18
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s19
; VI-NEXT: s_add_u32 s14, s4, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s15, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s14
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: s_add_u32 s14, s4, 16
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s15, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s14
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: s_add_u32 s0, s4, 64
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v10i64_to_v20i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:64
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v10i64_to_v20i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
; GFX11-NEXT: v_mov_b32_e32 v18, s0
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:48
; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:32
; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5]
; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:64
; GFX11-NEXT: s_endpgm
; Enter %if only when %cond is zero; otherwise go straight to the store.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Self-looping block: the phi is zero on the edge from %entry and %value on
; the back-edge, which is taken only when %cond == 1.
if:
%phi_value = phi <10 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <10 x i64> %phi_value to <20 x i32>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
; Merge zero (from %entry) with the bitcast result (from %if) and store it.
end:
%phi_cast = phi <20 x i32> [zeroinitializer, %entry], [%cast, %if]
store <20 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel test: a <11 x i64> flows through a phi in a self-looping block, is
; bitcast to <22 x i32>, merged with zero by a second phi and stored to %out.
; %if is entered only when %cond == 0 and branches back to itself only when
; %cond == 1, so the %value edge of the first phi is never taken at run time.
; The expected code for each llc configuration below is a single block that
; stores 88 bytes (22 dwords) of zero and never loads %value.
; Review note - the function name says "v20i32", but the bitcast, phi and
; store in the body all use <22 x i32>. The name is left unchanged here
; because the label checks below reference it.
define amdgpu_kernel void @bitcast_v11i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <11 x i64> %value) {
; GCN-LABEL: bitcast_v11i64_to_v20i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s6
; GCN-NEXT: v_mov_b32_e32 v8, s6
; GCN-NEXT: v_mov_b32_e32 v9, s6
; GCN-NEXT: v_mov_b32_e32 v10, s6
; GCN-NEXT: v_mov_b32_e32 v11, s6
; GCN-NEXT: v_mov_b32_e32 v12, s6
; GCN-NEXT: v_mov_b32_e32 v13, s6
; GCN-NEXT: v_mov_b32_e32 v14, s6
; GCN-NEXT: v_mov_b32_e32 v15, s6
; GCN-NEXT: v_mov_b32_e32 v16, s6
; GCN-NEXT: v_mov_b32_e32 v17, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx2 v[16:17], off, s[0:3], 0 offset:80
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v11i64_to_v20i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s20, s4, 48
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s21, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s20
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s21
; VI-NEXT: s_add_u32 s16, s4, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s17, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: s_add_u32 s10, s4, 16
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s11, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s10
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_mov_b32 s6, s0
; VI-NEXT: s_mov_b32 s7, s0
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_add_u32 s0, s4, 0x50
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_u32 s0, s4, 64
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v11i64_to_v20i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[24:25] offset:80
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:64
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v11i64_to_v20i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v22, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v21, s0
; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v17, s0
; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
; GFX11-NEXT: v_mov_b32_e32 v18, s0
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5] offset:48
; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:32
; GFX11-NEXT: global_store_b128 v22, v[8:11], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v22, v[12:15], s[4:5]
; GFX11-NEXT: global_store_b64 v22, v[20:21], s[4:5] offset:80
; GFX11-NEXT: global_store_b128 v22, v[16:19], s[4:5] offset:64
; GFX11-NEXT: s_endpgm
; Enter %if only when %cond is zero; otherwise go straight to the store.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Self-looping block: the phi is zero on the edge from %entry and %value on
; the back-edge, which is taken only when %cond == 1.
if:
%phi_value = phi <11 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <11 x i64> %phi_value to <22 x i32>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
; Merge zero (from %entry) with the bitcast result (from %if) and store it.
end:
%phi_cast = phi <22 x i32> [zeroinitializer, %entry], [%cast, %if]
store <22 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel test: a <12 x i64> flows through a phi in a self-looping block, is
; bitcast to <24 x i32>, merged with zero by a second phi and stored to %out.
; %if is entered only when %cond == 0 and branches back to itself only when
; %cond == 1, so the %value edge of the first phi is never taken at run time.
; The expected code for each llc configuration below is a single block that
; stores 96 bytes (24 dwords) of zero and never loads %value.
; Review note - the function name says "v22i32", but the bitcast, phi and
; store in the body all use <24 x i32>. The name is left unchanged here
; because the label checks below reference it.
define amdgpu_kernel void @bitcast_v12i64_to_v22i32(i32 %cond, ptr addrspace(1) %out, <12 x i64> %value) {
; GCN-LABEL: bitcast_v12i64_to_v22i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s6
; GCN-NEXT: v_mov_b32_e32 v8, s6
; GCN-NEXT: v_mov_b32_e32 v9, s6
; GCN-NEXT: v_mov_b32_e32 v10, s6
; GCN-NEXT: v_mov_b32_e32 v11, s6
; GCN-NEXT: v_mov_b32_e32 v12, s6
; GCN-NEXT: v_mov_b32_e32 v13, s6
; GCN-NEXT: v_mov_b32_e32 v14, s6
; GCN-NEXT: v_mov_b32_e32 v15, s6
; GCN-NEXT: v_mov_b32_e32 v16, s6
; GCN-NEXT: v_mov_b32_e32 v17, s6
; GCN-NEXT: v_mov_b32_e32 v18, s6
; GCN-NEXT: v_mov_b32_e32 v19, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v12i64_to_v22i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s22, s4, 0x50
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s23, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s22
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s23
; VI-NEXT: s_add_u32 s18, s4, 64
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s19, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s18
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s19
; VI-NEXT: s_add_u32 s14, s4, 48
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s15, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s14
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: s_add_u32 s10, s4, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s11, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s10
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: s_add_u32 s6, s4, 16
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s7, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v12i64_to_v22i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[26:27], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:80
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:64
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v12i64_to_v22i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
; GFX11-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v21, s0
; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s0
; GFX11-NEXT: v_mov_b32_e32 v22, s0
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: global_store_b128 v24, v[0:3], s[4:5] offset:80
; GFX11-NEXT: global_store_b128 v24, v[4:7], s[4:5] offset:64
; GFX11-NEXT: global_store_b128 v24, v[8:11], s[4:5] offset:48
; GFX11-NEXT: global_store_b128 v24, v[12:15], s[4:5] offset:32
; GFX11-NEXT: global_store_b128 v24, v[16:19], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v24, v[20:23], s[4:5]
; GFX11-NEXT: s_endpgm
; Enter %if only when %cond is zero; otherwise go straight to the store.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Self-looping block: the phi is zero on the edge from %entry and %value on
; the back-edge, which is taken only when %cond == 1.
if:
%phi_value = phi <12 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <12 x i64> %phi_value to <24 x i32>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
; Merge zero (from %entry) with the bitcast result (from %if) and store it.
end:
%phi_cast = phi <24 x i32> [zeroinitializer, %entry], [%cast, %if]
store <24 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel test: a <13 x i64> phi (zero from %entry, %value from the %if back-edge)
; is bitcast to <26 x i32>, merged with a zero vector in %end and stored to %out.
; NOTE(review): the function name says "v24i32" but the IR casts to and stores
; <26 x i32> (the expected stores cover 6 x 16 + 8 = 104 bytes); the name looks
; like a copy-paste slip - confirm before renaming, since the LABEL lines use it.
; NOTE(review): %if is entered from %entry only when %cond == 0, so %cmp1
; (%cond == 1) appears to be always false there. Consistent with that, the
; expected output has just two scalar loads (presumably %cond and %out), never
; reads %value, and stores registers that are all set from a zero constant.
define amdgpu_kernel void @bitcast_v13i64_to_v24i32(i32 %cond, ptr addrspace(1) %out, <13 x i64> %value) {
; GCN-LABEL: bitcast_v13i64_to_v24i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s6
; GCN-NEXT: v_mov_b32_e32 v8, s6
; GCN-NEXT: v_mov_b32_e32 v9, s6
; GCN-NEXT: v_mov_b32_e32 v10, s6
; GCN-NEXT: v_mov_b32_e32 v11, s6
; GCN-NEXT: v_mov_b32_e32 v12, s6
; GCN-NEXT: v_mov_b32_e32 v13, s6
; GCN-NEXT: v_mov_b32_e32 v14, s6
; GCN-NEXT: v_mov_b32_e32 v15, s6
; GCN-NEXT: v_mov_b32_e32 v16, s6
; GCN-NEXT: v_mov_b32_e32 v17, s6
; GCN-NEXT: v_mov_b32_e32 v18, s6
; GCN-NEXT: v_mov_b32_e32 v19, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:96
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v13i64_to_v24i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_add_u32 s24, s4, 0x50
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_addc_u32 s25, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s24
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s25
; VI-NEXT: s_add_u32 s20, s4, 64
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s21, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s20
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s21
; VI-NEXT: s_add_u32 s16, s4, 48
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s17, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: s_add_u32 s12, s4, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s13, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: s_add_u32 s6, s4, 16
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s7, s5, 0
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_add_u32 s0, s4, 0x60
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v13i64_to_v24i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:80
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:64
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[28:29] offset:96
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v13i64_to_v24i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
; GFX11-NEXT: v_mov_b32_e32 v18, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:80
; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:64
; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s0
; GFX11-NEXT: v_mov_b32_e32 v5, s0
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:48
; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5] offset:32
; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5]
; GFX11-NEXT: global_store_b64 v20, v[4:5], s[4:5] offset:96
; GFX11-NEXT: s_endpgm
entry:
; Enter the %if block only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Self-looping block: the phi is zero on first entry and %value on the back-edge.
%phi_value = phi <13 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <13 x i64> %phi_value to <26 x i32>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Merge the cast result with the zero vector used when %if was skipped.
%phi_cast = phi <26 x i32> [zeroinitializer, %entry], [%cast, %if]
store <26 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel test: a <14 x i64> phi (zero from %entry, %value from the %if back-edge)
; is bitcast to <28 x i32>, merged with a zero vector in %end and stored to %out.
; NOTE(review): the function name says "v26i32" but the IR casts to and stores
; <28 x i32> (the expected stores cover 7 x 16 = 112 bytes); the name looks like
; a copy-paste slip - confirm before renaming, since the LABEL lines use it.
; NOTE(review): %if is entered from %entry only when %cond == 0, so %cmp1
; (%cond == 1) appears to be always false there. Consistent with that, the
; expected output has just two scalar loads (presumably %cond and %out), never
; reads %value, and stores registers that are all set from a zero constant.
define amdgpu_kernel void @bitcast_v14i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <14 x i64> %value) {
; GCN-LABEL: bitcast_v14i64_to_v26i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s6
; GCN-NEXT: v_mov_b32_e32 v8, s6
; GCN-NEXT: v_mov_b32_e32 v9, s6
; GCN-NEXT: v_mov_b32_e32 v10, s6
; GCN-NEXT: v_mov_b32_e32 v11, s6
; GCN-NEXT: v_mov_b32_e32 v12, s6
; GCN-NEXT: v_mov_b32_e32 v13, s6
; GCN-NEXT: v_mov_b32_e32 v14, s6
; GCN-NEXT: v_mov_b32_e32 v15, s6
; GCN-NEXT: v_mov_b32_e32 v16, s6
; GCN-NEXT: v_mov_b32_e32 v17, s6
; GCN-NEXT: v_mov_b32_e32 v18, s6
; GCN-NEXT: v_mov_b32_e32 v19, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s6
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v14i64_to_v26i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_mov_b32 s2, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s0, 0x50
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s4, s0, 64
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s4, s0, 48
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s4, s0, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s4, s0, 16
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_add_u32 s0, s0, 0x60
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v14i64_to_v26i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[30:31], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:80
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:64
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:96
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v14i64_to_v26i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
; GFX11-NEXT: v_mov_b32_e32 v18, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:80
; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:64
; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s0
; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s0
; GFX11-NEXT: v_mov_b32_e32 v7, s0
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:48
; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5] offset:32
; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5]
; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:96
; GFX11-NEXT: s_endpgm
entry:
; Enter the %if block only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Self-looping block: the phi is zero on first entry and %value on the back-edge.
%phi_value = phi <14 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <14 x i64> %phi_value to <28 x i32>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Merge the cast result with the zero vector used when %if was skipped.
%phi_cast = phi <28 x i32> [zeroinitializer, %entry], [%cast, %if]
store <28 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Kernel test: a <15 x i64> phi (zero from %entry, %value from the %if back-edge)
; is bitcast to <30 x i32>, merged with a zero vector in %end and stored to %out.
; NOTE(review): the function name says "v26i32" but the IR casts to and stores
; <30 x i32> (the expected stores cover 7 x 16 + 8 = 120 bytes); the name looks
; like a copy-paste slip - confirm before renaming, since the LABEL lines use it.
; NOTE(review): %if is entered from %entry only when %cond == 0, so %cmp1
; (%cond == 1) appears to be always false there. Consistent with that, the
; expected output has just two scalar loads (presumably %cond and %out), never
; reads %value, and stores registers that are all set from a zero constant.
define amdgpu_kernel void @bitcast_v15i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <15 x i64> %value) {
; GCN-LABEL: bitcast_v15i64_to_v26i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s6
; GCN-NEXT: v_mov_b32_e32 v8, s6
; GCN-NEXT: v_mov_b32_e32 v9, s6
; GCN-NEXT: v_mov_b32_e32 v10, s6
; GCN-NEXT: v_mov_b32_e32 v11, s6
; GCN-NEXT: v_mov_b32_e32 v12, s6
; GCN-NEXT: v_mov_b32_e32 v13, s6
; GCN-NEXT: v_mov_b32_e32 v14, s6
; GCN-NEXT: v_mov_b32_e32 v15, s6
; GCN-NEXT: v_mov_b32_e32 v16, s6
; GCN-NEXT: v_mov_b32_e32 v17, s6
; GCN-NEXT: v_mov_b32_e32 v18, s6
; GCN-NEXT: v_mov_b32_e32 v19, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; GCN-NEXT: v_mov_b32_e32 v20, s6
; GCN-NEXT: v_mov_b32_e32 v21, s6
; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s6
; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx2 v[20:21], off, s[0:3], 0 offset:112
; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
; GCN-NEXT: s_endpgm
;
; VI-LABEL: bitcast_v15i64_to_v26i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_mov_b32 s2, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s2
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s0, 0x50
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s4, s0, 64
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s4, s0, 48
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s4, s0, 32
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s4, s0, 16
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_mov_b32 s15, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: s_add_u32 s2, s0, 0x70
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_add_u32 s0, s0, 0x60
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: bitcast_v15i64_to_v26i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[34:35] offset:112
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: bitcast_v15i64_to_v26i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v22, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
; GFX11-NEXT: v_mov_b32_e32 v18, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5] offset:80
; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:64
; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v20, s0
; GFX11-NEXT: v_dual_mov_b32 v21, s0 :: v_dual_mov_b32 v4, s0
; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s0
; GFX11-NEXT: v_mov_b32_e32 v7, s0
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: global_store_b128 v22, v[8:11], s[4:5] offset:48
; GFX11-NEXT: global_store_b128 v22, v[12:15], s[4:5] offset:32
; GFX11-NEXT: global_store_b128 v22, v[16:19], s[4:5] offset:16
; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5]
; GFX11-NEXT: global_store_b64 v22, v[20:21], s[4:5] offset:112
; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:96
; GFX11-NEXT: s_endpgm
entry:
; Enter the %if block only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Self-looping block: the phi is zero on first entry and %value on the back-edge.
%phi_value = phi <15 x i64> [zeroinitializer, %entry], [%value, %if]
%cast = bitcast <15 x i64> %phi_value to <30 x i32>
%cmp1 = icmp eq i32 %cond, 1
br i1 %cmp1, label %if, label %end
end:
; Merge the cast result with the zero vector used when %if was skipped.
%phi_cast = phi <30 x i32> [zeroinitializer, %entry], [%cast, %if]
store <30 x i32> %phi_cast, ptr addrspace(1) %out
ret void
}
; Function test: when %cond == 0, bitcasts <2 x bfloat> %value to i32 in %if;
; %end then stores either that cast or the constant 0 to %out.
define void @v_bitcast_v2bf16_to_i32(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v2bf16_to_i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB59_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v5, v0, v3, 16
; GCN-NEXT: .LBB59_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v5, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v2bf16_to_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dword v[1:2], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v2bf16_to_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dword v[1:2], v4, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v2bf16_to_i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b32 v[1:2], v4, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the cast path only when %cond is zero.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <2 x bfloat> %value to i32
br label %end
end:
; Select 0 when %if was skipped, otherwise the bitcast result.
%phi = phi i32 [0, %entry], [%cast, %if]
store i32 %phi, ptr addrspace(1) %out
ret void
}
; Function test: when %cond == 0, bitcasts <2 x bfloat> %value to <2 x i16> in
; %if; %end then stores either that cast or a zero vector to %out.
define void @v_bitcast_v2bf16_to_v2i16(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v2bf16_to_v2i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB60_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v5, v0, v3, 16
; GCN-NEXT: .LBB60_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v5, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v2bf16_to_v2i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dword v[1:2], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v2bf16_to_v2i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v2bf16_to_v2i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b32 v[1:2], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the cast path only when %cond is zero.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <2 x bfloat> %value to <2 x i16>
br label %end
end:
; Select the zero vector when %if was skipped, otherwise the bitcast result.
%phi = phi <2 x i16> [zeroinitializer, %entry], [%cast, %if]
store <2 x i16> %phi, ptr addrspace(1) %out
ret void
}
; Function test: when %cond == 0, bitcasts <2 x bfloat> %value to <2 x half> in
; %if; %end then stores either that cast or a zero vector to %out.
define void @v_bitcast_v2bf16_to_v2f16(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v2bf16_to_v2f16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB61_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v5, v3
; GCN-NEXT: .LBB61_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v5
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_or_b32_e32 v0, v0, v3
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v2bf16_to_v2f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dword v[1:2], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v2bf16_to_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v2bf16_to_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b32 v[1:2], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the cast path only when %cond is zero.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <2 x bfloat> %value to <2 x half>
br label %end
end:
; Select the zero vector when %if was skipped, otherwise the bitcast result.
%phi = phi <2 x half> [zeroinitializer, %entry], [%cast, %if]
store <2 x half> %phi, ptr addrspace(1) %out
ret void
}
; Function test: when %cond == 0, bitcasts <2 x bfloat> %value to <4 x i8> in
; %if; %end then stores either that cast or a zero vector to %out.
define void @v_bitcast_v2bf16_to_v4i8(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v2bf16_to_v4i8:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB62_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v5, v0, v3, 16
; GCN-NEXT: .LBB62_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v5, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v2bf16_to_v4i8:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dword v[1:2], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v2bf16_to_v4i8:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dword v[1:2], v4, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v2bf16_to_v4i8:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b32 v[1:2], v4, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the cast path only when %cond is zero.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <2 x bfloat> %value to <4 x i8>
br label %end
end:
; Select the zero vector when %if was skipped, otherwise the bitcast result.
%phi = phi <4 x i8> [zeroinitializer, %entry], [%cast, %if]
store <4 x i8> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <3 x bfloat> -> <3 x i16> whose result feeds a phi.
; When %cond is 0 the %if block reinterprets %value; otherwise control goes
; straight to %end and the phi yields zeroinitializer. The merged value is
; stored through the addrspace(1) pointer %out; in every check group below
; that store appears as a 32-bit store plus a 16-bit store at byte offset 4.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them rather than editing them by hand.
define void @v_bitcast_v3bf16_to_v3i16(i32 %cond, ptr addrspace(1) %out, <3 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v3bf16_to_v3i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, v6
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB63_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v6, v4, v3, 16
; GCN-NEXT: .LBB63_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 offset:4
; GCN-NEXT: buffer_store_dword v6, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v3bf16_to_v3i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_short v[3:4], v6
; VI-NEXT: flat_store_dword v[1:2], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v3bf16_to_v3i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4
; GFX9-NEXT: global_store_dword v[1:2], v5, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v3bf16_to_v3i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4
; GFX11-NEXT: global_store_b32 v[1:2], v5, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <3 x bfloat> %value to <3 x i16>
br label %end
end:
; zeroinitializer when %if was skipped, the bitcast result otherwise
%phi = phi <3 x i16> [zeroinitializer, %entry], [%cast, %if]
store <3 x i16> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <3 x bfloat> -> <3 x half> whose result feeds a phi.
; When %cond is 0 the %if block reinterprets %value; otherwise control goes
; straight to %end and the phi yields zeroinitializer. The merged value is
; stored through the addrspace(1) pointer %out.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them rather than editing them by hand.
define void @v_bitcast_v3bf16_to_v3f16(i32 %cond, ptr addrspace(1) %out, <3 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v3bf16_to_v3f16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v6, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB64_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v7, v4
; GCN-NEXT: .LBB64_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v7
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6
; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 offset:4
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_or_b32_e32 v0, v0, v3
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v3bf16_to_v3f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_short v[3:4], v6
; VI-NEXT: flat_store_dword v[1:2], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v3bf16_to_v3f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4
; GFX9-NEXT: global_store_dword v[1:2], v5, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v3bf16_to_v3f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4
; GFX11-NEXT: global_store_b32 v[1:2], v5, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <3 x bfloat> %value to <3 x half>
br label %end
end:
; zeroinitializer when %if was skipped, the bitcast result otherwise
%phi = phi <3 x half> [zeroinitializer, %entry], [%cast, %if]
store <3 x half> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast i32 -> <2 x bfloat> whose result feeds a phi.
; When %cond is 0 the %if block reinterprets %value; otherwise control goes
; straight to %end and the phi yields zeroinitializer. The merged value is
; stored through the addrspace(1) pointer %out.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them rather than editing them by hand.
define void @v_bitcast_i32_to_v2bf16(i32 %cond, ptr addrspace(1) %out, i32 %value) {
; GCN-LABEL: v_bitcast_i32_to_v2bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB65_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GCN-NEXT: .LBB65_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_i32_to_v2bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dword v[1:2], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_i32_to_v2bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_i32_to_v2bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b32 v[1:2], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast i32 %value to <2 x bfloat>
br label %end
end:
; zeroinitializer when %if was skipped, the bitcast result otherwise
%phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <2 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <2 x i16> -> <2 x bfloat> whose result feeds a phi.
; When %cond is 0 the %if block reinterprets %value; otherwise control goes
; straight to %end and the phi yields zeroinitializer. The merged value is
; stored through the addrspace(1) pointer %out.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them rather than editing them by hand.
define void @v_bitcast_v2i16_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <2 x i16> %value) {
; GCN-LABEL: v_bitcast_v2i16_to_v2bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB66_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GCN-NEXT: .LBB66_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v2i16_to_v2bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dword v[1:2], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v2i16_to_v2bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v2i16_to_v2bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b32 v[1:2], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <2 x i16> %value to <2 x bfloat>
br label %end
end:
; zeroinitializer when %if was skipped, the bitcast result otherwise
%phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <2 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <2 x half> -> <2 x bfloat> whose result feeds a phi.
; When %cond is 0 the %if block reinterprets %value; otherwise control goes
; straight to %end and the phi yields zeroinitializer. The merged value is
; stored through the addrspace(1) pointer %out.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them rather than editing them by hand.
define void @v_bitcast_v2f16_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <2 x half> %value) {
; GCN-LABEL: v_bitcast_v2f16_to_v2bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB67_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GCN-NEXT: .LBB67_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v2f16_to_v2bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dword v[1:2], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v2f16_to_v2bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v2f16_to_v2bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b32 v[1:2], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <2 x half> %value to <2 x bfloat>
br label %end
end:
; zeroinitializer when %if was skipped, the bitcast result otherwise
%phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <2 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <4 x i8> -> <2 x bfloat> whose result feeds a phi.
; When %cond is 0 the %if block reinterprets %value; otherwise control goes
; straight to %end and the phi yields zeroinitializer. The merged value is
; stored through the addrspace(1) pointer %out.
; In every check group below the four i8 elements are read from separate
; registers (v3..v6) and packed inside the %if block.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them rather than editing them by hand.
define void @v_bitcast_v4i8_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <4 x i8> %value) {
; GCN-LABEL: v_bitcast_v4i8_to_v2bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB68_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_or_b32_e32 v0, v3, v0
; GCN-NEXT: v_or_b32_e32 v7, v5, v4
; GCN-NEXT: .LBB68_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4i8_to_v2bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v7, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dword v[1:2], v7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4i8_to_v2bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4i8_to_v2bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: s_cbranch_execz .LBB68_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: .LBB68_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b32 v[1:2], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <4 x i8> %value to <2 x bfloat>
br label %end
end:
; zeroinitializer when %if was skipped, the bitcast result otherwise
%phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <2 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <3 x i16> -> <3 x bfloat> whose result feeds a phi.
; When %cond is 0 the %if block reinterprets %value; otherwise control goes
; straight to %end and the phi yields zeroinitializer. The merged value is
; stored through the addrspace(1) pointer %out; in every check group below
; that store appears as a 32-bit store plus a 16-bit store at byte offset 4.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them rather than editing them by hand.
define void @v_bitcast_v3i16_to_v3bf16(i32 %cond, ptr addrspace(1) %out, <3 x i16> %value) {
; GCN-LABEL: v_bitcast_v3i16_to_v3bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v6, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB69_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v5
; GCN-NEXT: .LBB69_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 offset:4
; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v3i16_to_v3bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_short v[3:4], v6
; VI-NEXT: flat_store_dword v[1:2], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v3i16_to_v3bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4
; GFX9-NEXT: global_store_dword v[1:2], v5, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v3i16_to_v3bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4
; GFX11-NEXT: global_store_b32 v[1:2], v5, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <3 x i16> %value to <3 x bfloat>
br label %end
end:
; zeroinitializer when %if was skipped, the bitcast result otherwise
%phi = phi <3 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <3 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <4 x bfloat> -> <4 x half> whose result feeds a phi.
; When %cond is 0 the %if block reinterprets %value; otherwise control goes
; straight to %end and the phi yields zeroinitializer. The merged value is
; stored through the addrspace(1) pointer %out.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them rather than editing them by hand.
define void @v_bitcast_v4bf16_to_v4f16(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v4bf16_to_v4f16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB70_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v9, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4
; GCN-NEXT: v_cvt_f32_f16_e32 v7, v5
; GCN-NEXT: .LBB70_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v9
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v7
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v8
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_or_b32_e32 v3, v0, v3
; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4bf16_to_v4f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4bf16_to_v4f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4bf16_to_v4f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <4 x bfloat> %value to <4 x half>
br label %end
end:
; zeroinitializer when %if was skipped, the bitcast result otherwise
%phi = phi <4 x half> [zeroinitializer, %entry], [%cast, %if]
store <4 x half> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <4 x bfloat> -> <4 x i16> whose result feeds a phi.
; When %cond is 0 the %if block reinterprets %value; otherwise control goes
; straight to %end and the phi yields zeroinitializer. The merged value is
; stored through the addrspace(1) pointer %out.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them rather than editing them by hand.
define void @v_bitcast_v4bf16_to_v4i16(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v4bf16_to_v4i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v8, v7
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB71_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
; GCN-NEXT: .LBB71_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4bf16_to_v4i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4bf16_to_v4i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4bf16_to_v4i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <4 x bfloat> %value to <4 x i16>
br label %end
end:
; zeroinitializer when %if was skipped, the bitcast result otherwise
%phi = phi <4 x i16> [zeroinitializer, %entry], [%cast, %if]
store <4 x i16> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <4 x bfloat> -> <2 x i32> whose result feeds a phi.
; When %cond is 0 the %if block reinterprets %value; otherwise control goes
; straight to %end and the phi yields zeroinitializer. The merged value is
; stored through the addrspace(1) pointer %out.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them rather than editing them by hand.
define void @v_bitcast_v4bf16_to_v2i32(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v4bf16_to_v2i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v8, v7
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB72_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
; GCN-NEXT: .LBB72_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4bf16_to_v2i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4bf16_to_v2i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4bf16_to_v2i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <4 x bfloat> %value to <2 x i32>
br label %end
end:
; zeroinitializer when %if was skipped, the bitcast result otherwise
%phi = phi <2 x i32> [zeroinitializer, %entry], [%cast, %if]
store <2 x i32> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <4 x bfloat> to <2 x float>. The cast is performed
; only in %if (taken when %cond == 0); %end stores either the cast result or
; zeroinitializer to %out.
; In the expected output below, the run without -mcpu packs four input VGPRs
; (v3-v6) into two dwords with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32,
; while the tonga, gfx900 and gfx1100 runs only copy v3/v4 into the stored pair.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing them by hand.
define void @v_bitcast_v4bf16_to_v2f32(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v4bf16_to_v2f32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v8, v7
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB73_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
; GCN-NEXT: .LBB73_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4bf16_to_v2f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4bf16_to_v2f32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4bf16_to_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <4 x bfloat> %value to <2 x float>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <2 x float> [zeroinitializer, %entry], [%cast, %if]
store <2 x float> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <4 x bfloat> to a scalar double. The cast is performed
; only in %if (taken when %cond == 0); %end stores either the cast result or
; 0.0 to %out.
; In the expected output below, the run without -mcpu packs four input VGPRs
; (v3-v6) into two dwords with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32,
; while the tonga, gfx900 and gfx1100 runs only copy v3/v4 into the stored pair.
; Every run zeroes both halves of the result with separate moves of constant 0.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing them by hand.
define void @v_bitcast_v4bf16_to_f64(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v4bf16_to_f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB74_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
; GCN-NEXT: .LBB74_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4bf16_to_f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_mov_b32_e32 v6, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4bf16_to_f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4bf16_to_f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <4 x bfloat> %value to double
br label %end
end:
; 0.0 when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi double [0.0, %entry], [%cast, %if]
store double %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <4 x bfloat> to a scalar i64. The cast is performed
; only in %if (taken when %cond == 0); %end stores either the cast result or
; 0 to %out.
; In the expected output below, the run without -mcpu packs four input VGPRs
; (v3-v6) into two dwords with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32,
; while the tonga, gfx900 and gfx1100 runs only copy v3/v4 into the stored pair.
; Every run zeroes both halves of the result with separate moves of constant 0.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing them by hand.
define void @v_bitcast_v4bf16_to_i64(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v4bf16_to_i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB75_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
; GCN-NEXT: .LBB75_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4bf16_to_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_mov_b32_e32 v6, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4bf16_to_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4bf16_to_i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <4 x bfloat> %value to i64
br label %end
end:
; 0 when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi i64 [0, %entry], [%cast, %if]
store i64 %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <4 x bfloat> to <8 x i8>. The cast is performed only
; in %if (taken when %cond == 0); %end stores either the cast result or
; zeroinitializer to %out.
; In the expected output below, the run without -mcpu packs four input VGPRs
; (v3-v6) into two dwords with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32,
; while the tonga, gfx900 and gfx1100 runs only copy v3/v4 into the stored pair.
; All four runs store the result with a single 64-bit store.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing them by hand.
define void @v_bitcast_v4bf16_to_v8i8(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v4bf16_to_v8i8:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v8, v7
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB76_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
; GCN-NEXT: .LBB76_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4bf16_to_v8i8:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4bf16_to_v8i8:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4bf16_to_v8i8:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <4 x bfloat> %value to <8 x i8>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <8 x i8> [zeroinitializer, %entry], [%cast, %if]
store <8 x i8> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of a scalar i64 to <4 x bfloat>. The cast is performed
; only in %if (taken when %cond == 0); %end stores either the cast result or
; zeroinitializer to %out.
; In the expected output below, the run without -mcpu splits each input dword
; (v3, v4) into a high half (v_and_b32 with 0xffff0000) and a low half
; (v_lshlrev_b32 by 16) inside %if, then repacks the four values into two
; dwords after %end with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32. The tonga,
; gfx900 and gfx1100 runs only copy v3/v4 into the stored pair.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing them by hand.
define void @v_bitcast_i64_to_v4bf16(i32 %cond, ptr addrspace(1) %out, i64 %value) {
; GCN-LABEL: v_bitcast_i64_to_v4bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v6, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB77_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GCN-NEXT: .LBB77_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_i64_to_v4bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_i64_to_v4bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_i64_to_v4bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast i64 %value to <4 x bfloat>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <4 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <2 x float> to <4 x bfloat>. The cast is performed
; only in %if (taken when %cond == 0); %end stores either the cast result or
; zeroinitializer to %out.
; In the expected output below, the run without -mcpu splits each input dword
; (v3, v4) into a high half (v_and_b32 with 0xffff0000) and a low half
; (v_lshlrev_b32 by 16) inside %if, then repacks the four values into two
; dwords after %end with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32. The tonga,
; gfx900 and gfx1100 runs only copy v3/v4 into the stored pair.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing them by hand.
define void @v_bitcast_v2f32_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <2 x float> %value) {
; GCN-LABEL: v_bitcast_v2f32_to_v4bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v6, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB78_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GCN-NEXT: .LBB78_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v2f32_to_v4bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v2f32_to_v4bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v2f32_to_v4bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <2 x float> %value to <4 x bfloat>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <4 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <2 x i32> to <4 x bfloat>. The cast is performed only
; in %if (taken when %cond == 0); %end stores either the cast result or
; zeroinitializer to %out.
; In the expected output below, the run without -mcpu splits each input dword
; (v3, v4) into a high half (v_and_b32 with 0xffff0000) and a low half
; (v_lshlrev_b32 by 16) inside %if, then repacks the four values into two
; dwords after %end with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32. The tonga,
; gfx900 and gfx1100 runs only copy v3/v4 into the stored pair.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing them by hand.
define void @v_bitcast_v2i32_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <2 x i32> %value) {
; GCN-LABEL: v_bitcast_v2i32_to_v4bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v6, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB79_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GCN-NEXT: .LBB79_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v2i32_to_v4bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v2i32_to_v4bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v2i32_to_v4bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <2 x i32> %value to <4 x bfloat>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <4 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <4 x i16> to <4 x bfloat>. The cast is performed only
; in %if (taken when %cond == 0); %end stores either the cast result or
; zeroinitializer to %out.
; In the expected output below, the run without -mcpu reads four input VGPRs
; (v3-v6), shifts each left by 16 inside %if, then repacks the four values into
; two dwords after %end with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32. The
; tonga, gfx900 and gfx1100 runs only copy v3/v4 into the stored pair.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing them by hand.
define void @v_bitcast_v4i16_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <4 x i16> %value) {
; GCN-LABEL: v_bitcast_v4i16_to_v4bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB80_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GCN-NEXT: .LBB80_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v8
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v3, v3, v0, 16
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4i16_to_v4bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4i16_to_v4bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4i16_to_v4bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <4 x i16> %value to <4 x bfloat>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <4 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <4 x half> to <4 x bfloat>. The cast is performed only
; in %if (taken when %cond == 0); %end stores either the cast result or
; zeroinitializer to %out.
; In the expected output below, the run without -mcpu reads four input VGPRs
; (v3-v6), applies v_cvt_f16_f32 to each and shifts the result left by 16
; inside %if, then repacks the four values into two dwords after %end with
; v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32. The tonga, gfx900 and gfx1100
; runs only copy v3/v4 into the stored pair.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing them by hand.
define void @v_bitcast_v4f16_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <4 x half> %value) {
; GCN-LABEL: v_bitcast_v4f16_to_v4bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB81_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v5
; GCN-NEXT: .LBB81_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v8
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v3, v3, v0, 16
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4f16_to_v4bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v6, v4
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4f16_to_v4bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v6, v5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4f16_to_v4bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <4 x half> %value to <4 x bfloat>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <4 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <6 x bfloat> to <6 x i16>. The cast is performed only
; in %if (taken when %cond == 0); %end stores either the cast result or
; zeroinitializer to %out.
; In the expected output below, the run without -mcpu packs six input VGPRs
; (v3-v8) into three dwords with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32 and
; writes them with a dword store at offset 8 plus a dwordx2 store. The tonga,
; gfx900 and gfx1100 runs copy v3-v5 into v6-v8 and use one 96-bit store.
; NOTE(review): in those three runs %if also copies v6 into v9, and the store
; shown here only reads v[6:8] - presumably a leftover copy; confirm before
; relying on it.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them rather than editing them by hand.
define void @v_bitcast_v6bf16_to_v6i16(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v6bf16_to_v6i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v10, v9
; GCN-NEXT: v_mov_b32_e32 v0, v9
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB82_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_alignbit_b32 v9, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v0, v6, v7, 16
; GCN-NEXT: .LBB82_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
; GCN-NEXT: buffer_store_dwordx2 v[9:10], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v6bf16_to_v6i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v6, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_mov_b32_e32 v8, v6
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v9, v6
; VI-NEXT: v_mov_b32_e32 v8, v5
; VI-NEXT: v_mov_b32_e32 v7, v4
; VI-NEXT: v_mov_b32_e32 v6, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v6bf16_to_v6i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v7, v6
; GFX9-NEXT: v_mov_b32_e32 v8, v6
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v9, v6
; GFX9-NEXT: v_mov_b32_e32 v8, v5
; GFX9-NEXT: v_mov_b32_e32 v7, v4
; GFX9-NEXT: v_mov_b32_e32 v6, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v6bf16_to_v6i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4
; GFX11-NEXT: v_mov_b32_e32 v6, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <6 x bfloat> %value to <6 x i16>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <6 x i16> [zeroinitializer, %entry], [%cast, %if]
store <6 x i16> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast <6 x bfloat> -> <6 x half>. The cast happens only in %if
; (taken when %cond == 0); %end merges it with zeroinitializer through a phi and
; stores the 96-bit result to %out in addrspace(1).
; In the checks below, the default-target output converts each element around
; the join (v_mul_f32 / v_cvt_*), while the VI/GFX9/GFX11 output is only
; register copies followed by a single 3-dword store.
; The assertions are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
define void @v_bitcast_v6bf16_to_v6f16(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v6bf16_to_v6f16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v10, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB83_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_cvt_f32_f16_e32 v10, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v12, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4
; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5
; GCN-NEXT: v_cvt_f32_f16_e32 v9, v6
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7
; GCN-NEXT: .LBB83_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v12
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v10
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v13
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v11
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_or_b32_e32 v3, v4, v3
; GCN-NEXT: v_or_b32_e32 v4, v6, v5
; GCN-NEXT: v_or_b32_e32 v0, v7, v0
; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v6bf16_to_v6f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v6, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_mov_b32_e32 v8, v6
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v9, v6
; VI-NEXT: v_mov_b32_e32 v8, v5
; VI-NEXT: v_mov_b32_e32 v7, v4
; VI-NEXT: v_mov_b32_e32 v6, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v6bf16_to_v6f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v7, v6
; GFX9-NEXT: v_mov_b32_e32 v8, v6
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v9, v6
; GFX9-NEXT: v_mov_b32_e32 v8, v5
; GFX9-NEXT: v_mov_b32_e32 v7, v4
; GFX9-NEXT: v_mov_b32_e32 v6, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v6bf16_to_v6f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4
; GFX11-NEXT: v_mov_b32_e32 v6, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Enter %if only when %cond is zero; otherwise fall through to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test; it only reinterprets the 96 bits of %value.
%cast = bitcast <6 x bfloat> %value to <6 x half>
br label %end
end:
; All-zero vector when coming from %entry, the cast value when coming from %if.
%phi = phi <6 x half> [zeroinitializer, %entry], [%cast, %if]
store <6 x half> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast <6 x bfloat> -> <12 x i8>. The cast happens only in %if
; (taken when %cond == 0); %end merges it with zeroinitializer through a phi and
; stores the 96-bit result to %out in addrspace(1).
; In the checks below, the default-target output packs pairs of elements with
; v_alignbit_b32 inside %if, while the VI/GFX9/GFX11 output is only register
; copies followed by a single 3-dword store.
; The assertions are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
define void @v_bitcast_v6bf16_to_v12i8(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v6bf16_to_v12i8:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v10, v9
; GCN-NEXT: v_mov_b32_e32 v0, v9
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB84_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_alignbit_b32 v9, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v0, v6, v7, 16
; GCN-NEXT: .LBB84_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
; GCN-NEXT: buffer_store_dwordx2 v[9:10], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v6bf16_to_v12i8:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v6, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_mov_b32_e32 v8, v6
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v8, v5
; VI-NEXT: v_mov_b32_e32 v7, v4
; VI-NEXT: v_mov_b32_e32 v6, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v6bf16_to_v12i8:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v7, v6
; GFX9-NEXT: v_mov_b32_e32 v8, v6
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v8, v5
; GFX9-NEXT: v_mov_b32_e32 v7, v4
; GFX9-NEXT: v_mov_b32_e32 v6, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v6bf16_to_v12i8:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4
; GFX11-NEXT: v_mov_b32_e32 v6, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Enter %if only when %cond is zero; otherwise fall through to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test; it only reinterprets the 96 bits of %value.
%cast = bitcast <6 x bfloat> %value to <12 x i8>
br label %end
end:
; All-zero vector when coming from %entry, the cast value when coming from %if.
%phi = phi <12 x i8> [zeroinitializer, %entry], [%cast, %if]
store <12 x i8> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast <6 x half> -> <6 x bfloat>. The cast happens only in %if
; (taken when %cond == 0); %end merges it with zeroinitializer through a phi and
; stores the 96-bit result to %out in addrspace(1).
; In the checks below, the default-target output converts each element in %if
; (v_cvt_f16_f32 + shift) and repacks after the join (v_mul_f32 /
; v_alignbit_b32), while the VI/GFX9/GFX11 output is only register copies
; followed by a single 3-dword store.
; The assertions are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
define void @v_bitcast_v6f16_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <6 x half> %value) {
; GCN-LABEL: v_bitcast_v6f16_to_v6bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v10, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB85_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GCN-NEXT: .LBB85_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16
; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v6f16_to_v6bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v6, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_mov_b32_e32 v8, v6
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v9, v6
; VI-NEXT: v_mov_b32_e32 v8, v5
; VI-NEXT: v_mov_b32_e32 v7, v4
; VI-NEXT: v_mov_b32_e32 v6, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v6f16_to_v6bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v7, v6
; GFX9-NEXT: v_mov_b32_e32 v8, v6
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v9, v6
; GFX9-NEXT: v_mov_b32_e32 v8, v5
; GFX9-NEXT: v_mov_b32_e32 v7, v4
; GFX9-NEXT: v_mov_b32_e32 v6, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v6f16_to_v6bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4
; GFX11-NEXT: v_mov_b32_e32 v6, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Enter %if only when %cond is zero; otherwise fall through to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test; it only reinterprets the 96 bits of %value.
%cast = bitcast <6 x half> %value to <6 x bfloat>
br label %end
end:
; All-zero vector when coming from %entry, the cast value when coming from %if.
%phi = phi <6 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <6 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast <6 x i16> -> <6 x bfloat>. The cast happens only in %if
; (taken when %cond == 0); %end merges it with zeroinitializer through a phi and
; stores the 96-bit result to %out in addrspace(1).
; In the checks below, the default-target output shifts each element left by 16
; in %if and repacks after the join (v_mul_f32 / v_alignbit_b32), while the
; VI/GFX9/GFX11 output is only register copies followed by a single 3-dword
; store.
; The assertions are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
define void @v_bitcast_v6i16_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <6 x i16> %value) {
; GCN-LABEL: v_bitcast_v6i16_to_v6bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v10, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB86_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8
; GCN-NEXT: .LBB86_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16
; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v6i16_to_v6bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v6, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_mov_b32_e32 v8, v6
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v9, v6
; VI-NEXT: v_mov_b32_e32 v8, v5
; VI-NEXT: v_mov_b32_e32 v7, v4
; VI-NEXT: v_mov_b32_e32 v6, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v6i16_to_v6bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v7, v6
; GFX9-NEXT: v_mov_b32_e32 v8, v6
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v9, v6
; GFX9-NEXT: v_mov_b32_e32 v8, v5
; GFX9-NEXT: v_mov_b32_e32 v7, v4
; GFX9-NEXT: v_mov_b32_e32 v6, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v6i16_to_v6bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v7, v6
; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4
; GFX11-NEXT: v_mov_b32_e32 v6, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Enter %if only when %cond is zero; otherwise fall through to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test; it only reinterprets the 96 bits of %value.
%cast = bitcast <6 x i16> %value to <6 x bfloat>
br label %end
end:
; All-zero vector when coming from %entry, the cast value when coming from %if.
%phi = phi <6 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <6 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast <12 x i8> -> <6 x bfloat>. The cast happens only in %if
; (taken when %cond == 0); %end merges it with zeroinitializer through a phi and
; stores the 96-bit result to %out in addrspace(1).
; In the checks below every target assembles the twelve byte arguments
; (v3..v14) into wider values inside %if: the VI/GFX9 output uses SDWA
; v_or_b32 forms, the GFX11 output uses and/shift/or sequences, and the
; default-target output additionally repacks after the join (v_mul_f32 /
; v_alignbit_b32).
; The assertions are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
define void @v_bitcast_v12i8_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <12 x i8> %value) {
; GCN-LABEL: v_bitcast_v12i8_to_v6bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: v_mov_b32_e32 v15, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB87_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7
; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9
; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11
; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13
; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_or_b32_e32 v6, v6, v7
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
; GCN-NEXT: v_or_b32_e32 v16, v3, v0
; GCN-NEXT: v_or_b32_e32 v18, v5, v4
; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GCN-NEXT: v_or_b32_e32 v19, v9, v7
; GCN-NEXT: v_or_b32_e32 v15, v11, v8
; GCN-NEXT: v_or_b32_e32 v0, v13, v10
; GCN-NEXT: .LBB87_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v17
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v15
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16
; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v12i8_to_v6bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v15, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v16, v15
; VI-NEXT: v_mov_b32_e32 v17, v15
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB87_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10
; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v16, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14
; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v17, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: .LBB87_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx3 v[1:2], v[15:17]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v12i8_to_v6bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v15, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v16, v15
; GFX9-NEXT: v_mov_b32_e32 v17, v15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB87_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10
; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v16, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14
; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v17, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: .LBB87_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx3 v[1:2], v[15:17], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v12i8_to_v6bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v15, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v16, v15
; GFX11-NEXT: v_mov_b32_e32 v17, v15
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB87_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v9
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v8
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v10
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v11
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v12
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v13
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v14
; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
; GFX11-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-NEXT: v_or_b32_e32 v5, v7, v8
; GFX11-NEXT: v_or_b32_e32 v6, v9, v10
; GFX11-NEXT: v_or_b32_e32 v7, v11, v12
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-NEXT: v_or_b32_e32 v15, v0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v16, v3, v5
; GFX11-NEXT: v_or_b32_e32 v17, v6, v7
; GFX11-NEXT: .LBB87_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b96 v[1:2], v[15:17], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Enter %if only when %cond is zero; otherwise fall through to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test; it only reinterprets the 96 bits of %value.
%cast = bitcast <12 x i8> %value to <6 x bfloat>
br label %end
end:
; All-zero vector when coming from %entry, the cast value when coming from %if.
%phi = phi <6 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <6 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast <8 x bfloat> -> <2 x double>. The cast happens only in %if
; (taken when %cond == 0); %end merges it with zeroinitializer through a phi and
; stores the 128-bit result to %out in addrspace(1).
; In the checks below, the default-target output packs pairs of elements with
; v_alignbit_b32 inside %if, while the VI/GFX9/GFX11 output is only register
; copies followed by a single 4-dword store.
; The assertions are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
define void @v_bitcast_v8bf16_to_v2f64(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v8bf16_to_v2f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v12, v11
; GCN-NEXT: v_mov_b32_e32 v13, v11
; GCN-NEXT: v_mov_b32_e32 v14, v11
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB88_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
; GCN-NEXT: .LBB88_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8bf16_to_v2f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8bf16_to_v2f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8bf16_to_v2f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Enter %if only when %cond is zero; otherwise fall through to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test; it only reinterprets the 128 bits of %value.
%cast = bitcast <8 x bfloat> %value to <2 x double>
br label %end
end:
; All-zero vector when coming from %entry, the cast value when coming from %if.
%phi = phi <2 x double> [zeroinitializer, %entry], [%cast, %if]
store <2 x double> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast <8 x bfloat> -> <2 x i64>. The cast happens only in %if
; (taken when %cond == 0); %end merges it with zeroinitializer through a phi and
; stores the 128-bit result to %out in addrspace(1).
; In the checks below, the default-target output packs pairs of elements with
; v_alignbit_b32 inside %if, while the VI/GFX9/GFX11 output is only register
; copies followed by a single 4-dword store.
; The assertions are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing them by hand.
define void @v_bitcast_v8bf16_to_v2i64(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v8bf16_to_v2i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v12, v11
; GCN-NEXT: v_mov_b32_e32 v13, v11
; GCN-NEXT: v_mov_b32_e32 v14, v11
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB89_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
; GCN-NEXT: .LBB89_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8bf16_to_v2i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8bf16_to_v2i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8bf16_to_v2i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Enter %if only when %cond is zero; otherwise fall through to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test; it only reinterprets the 128 bits of %value.
%cast = bitcast <8 x bfloat> %value to <2 x i64>
br label %end
end:
; All-zero vector when coming from %entry, the cast value when coming from %if.
%phi = phi <2 x i64> [zeroinitializer, %entry], [%cast, %if]
store <2 x i64> %phi, ptr addrspace(1) %out
ret void
}
; Conditionally bitcasts an <8 x bfloat> argument to <4 x float>, merges the
; result with zero through a vector phi, and stores it through %out.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
; Per the expected output below, the tonga, gfx900 and gfx1100 runs lower the
; cast to plain register copies (v3..v6 into v7..v10), while the
; default-subtarget run repacks eight inputs (v3..v10) into four dwords using
; v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32.
define void @v_bitcast_v8bf16_to_v4f32(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v8bf16_to_v4f32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v12, v11
; GCN-NEXT: v_mov_b32_e32 v13, v11
; GCN-NEXT: v_mov_b32_e32 v14, v11
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB90_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
; GCN-NEXT: .LBB90_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8bf16_to_v4f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8bf16_to_v4f32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8bf16_to_v4f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
; The %if block is entered only when %cond is zero; otherwise control goes
; straight to %end.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Reinterpret the 128 bits of %value as four floats; the IR requests no value
; conversion, only a change of type.
if:
%cast = bitcast <8 x bfloat> %value to <4 x float>
br label %end
; The phi yields all-zero when the cast was skipped and the cast result
; otherwise; the merged vector is then written to %out.
; NOTE(review): the file's run lines raise
; -amdgpu-codegenprepare-break-large-phis-threshold, presumably so this vector
; phi reaches instruction selection unsplit -- confirm.
end:
%phi = phi <4 x float> [zeroinitializer, %entry], [%cast, %if]
store <4 x float> %phi, ptr addrspace(1) %out
ret void
}
; Conditionally bitcasts an <8 x bfloat> argument to <4 x i32>, merges the
; result with zero through a vector phi, and stores it through %out.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
; Per the expected output below, the tonga, gfx900 and gfx1100 runs lower the
; cast to plain register copies (v3..v6 into v7..v10), while the
; default-subtarget run repacks eight inputs (v3..v10) into four dwords using
; v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32.
define void @v_bitcast_v8bf16_to_v4i32(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v8bf16_to_v4i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v12, v11
; GCN-NEXT: v_mov_b32_e32 v13, v11
; GCN-NEXT: v_mov_b32_e32 v14, v11
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB91_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
; GCN-NEXT: .LBB91_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8bf16_to_v4i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8bf16_to_v4i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8bf16_to_v4i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
; The %if block is entered only when %cond is zero; otherwise control goes
; straight to %end.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Reinterpret the 128 bits of %value as four 32-bit integers; the IR requests
; no value conversion, only a change of type.
if:
%cast = bitcast <8 x bfloat> %value to <4 x i32>
br label %end
; The phi yields all-zero when the cast was skipped and the cast result
; otherwise; the merged vector is then written to %out.
; NOTE(review): the file's run lines raise
; -amdgpu-codegenprepare-break-large-phis-threshold, presumably so this vector
; phi reaches instruction selection unsplit -- confirm.
end:
%phi = phi <4 x i32> [zeroinitializer, %entry], [%cast, %if]
store <4 x i32> %phi, ptr addrspace(1) %out
ret void
}
; Conditionally bitcasts an <8 x bfloat> argument to <8 x half>, merges the
; result with zero through a vector phi, and stores it through %out.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
; Per the expected output below, the tonga, gfx900 and gfx1100 runs lower the
; cast to plain register copies (v3..v6 into v7..v10). The default-subtarget
; run instead handles eight separate inputs (v3..v10): inside %if it emits
; v_mul_f32 / v_lshrrev_b32 / v_cvt_f32_f16 per element, and after the join it
; emits v_cvt_f16_f32 / v_lshlrev_b32 / v_or_b32 to build the four stored dwords.
define void @v_bitcast_v8bf16_to_v8f16(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v8bf16_to_v8f16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v15, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: v_mov_b32_e32 v14, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB92_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_cvt_f32_f16_e32 v11, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v12, v4
; GCN-NEXT: v_cvt_f32_f16_e32 v16, v5
; GCN-NEXT: v_cvt_f32_f16_e32 v13, v6
; GCN-NEXT: v_cvt_f32_f16_e32 v17, v7
; GCN-NEXT: v_cvt_f32_f16_e32 v14, v8
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9
; GCN-NEXT: .LBB92_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v15
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v11
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v16
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v12
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v8, v13
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v9, v14
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_or_b32_e32 v3, v4, v3
; GCN-NEXT: v_or_b32_e32 v4, v6, v5
; GCN-NEXT: v_or_b32_e32 v5, v8, v7
; GCN-NEXT: v_or_b32_e32 v6, v9, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8bf16_to_v8f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8bf16_to_v8f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8bf16_to_v8f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
; The %if block is entered only when %cond is zero; otherwise control goes
; straight to %end.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Reinterpret each 16-bit bfloat lane as a half lane; the IR requests no value
; conversion, only a change of type.
if:
%cast = bitcast <8 x bfloat> %value to <8 x half>
br label %end
; The phi yields all-zero when the cast was skipped and the cast result
; otherwise; the merged vector is then written to %out.
; NOTE(review): the file's run lines raise
; -amdgpu-codegenprepare-break-large-phis-threshold, presumably so this vector
; phi reaches instruction selection unsplit -- confirm.
end:
%phi = phi <8 x half> [zeroinitializer, %entry], [%cast, %if]
store <8 x half> %phi, ptr addrspace(1) %out
ret void
}
; Conditionally bitcasts an <8 x bfloat> argument to <8 x i16>, merges the
; result with zero through a vector phi, and stores it through %out.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
; Per the expected output below, the tonga, gfx900 and gfx1100 runs lower the
; cast to plain register copies (v3..v6 into v7..v10), while the
; default-subtarget run repacks eight inputs (v3..v10) into four dwords using
; v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32.
define void @v_bitcast_v8bf16_to_v8i16(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v8bf16_to_v8i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v12, v11
; GCN-NEXT: v_mov_b32_e32 v13, v11
; GCN-NEXT: v_mov_b32_e32 v14, v11
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB93_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
; GCN-NEXT: .LBB93_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8bf16_to_v8i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8bf16_to_v8i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8bf16_to_v8i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
; The %if block is entered only when %cond is zero; otherwise control goes
; straight to %end.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Reinterpret each 16-bit bfloat lane as an i16 lane; the IR requests no value
; conversion, only a change of type.
if:
%cast = bitcast <8 x bfloat> %value to <8 x i16>
br label %end
; The phi yields all-zero when the cast was skipped and the cast result
; otherwise; the merged vector is then written to %out.
; NOTE(review): the file's run lines raise
; -amdgpu-codegenprepare-break-large-phis-threshold, presumably so this vector
; phi reaches instruction selection unsplit -- confirm.
end:
%phi = phi <8 x i16> [zeroinitializer, %entry], [%cast, %if]
store <8 x i16> %phi, ptr addrspace(1) %out
ret void
}
; Conditionally bitcasts an <8 x half> argument to <8 x bfloat>, merges the
; result with zero through a vector phi, and stores it through %out.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
; Per the expected output below, the tonga, gfx900 and gfx1100 runs lower the
; cast to plain register copies (v3..v6 into v7..v10). The default-subtarget
; run instead handles eight separate inputs (v3..v10): inside %if it emits
; v_cvt_f16_f32 / v_lshlrev_b32 per element, and after the join it emits
; v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32 to build the four stored dwords.
define void @v_bitcast_v8f16_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <8 x half> %value) {
; GCN-LABEL: v_bitcast_v8f16_to_v8bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v15, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: v_mov_b32_e32 v14, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB94_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8
; GCN-NEXT: v_cvt_f16_f32_e32 v8, v9
; GCN-NEXT: v_cvt_f16_f32_e32 v9, v10
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9
; GCN-NEXT: .LBB94_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8f16_to_v8bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8f16_to_v8bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8f16_to_v8bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
; The %if block is entered only when %cond is zero; otherwise control goes
; straight to %end.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Reinterpret each 16-bit half lane as a bfloat lane; the IR requests no value
; conversion, only a change of type.
if:
%cast = bitcast <8 x half> %value to <8 x bfloat>
br label %end
; The phi yields all-zero when the cast was skipped and the cast result
; otherwise; the merged vector is then written to %out.
; NOTE(review): the file's run lines raise
; -amdgpu-codegenprepare-break-large-phis-threshold, presumably so this vector
; phi reaches instruction selection unsplit -- confirm.
end:
%phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <8 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditionally bitcasts an <8 x i16> argument to <8 x bfloat>, merges the
; result with zero through a vector phi, and stores it through %out.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
; Per the expected output below, the tonga, gfx900 and gfx1100 runs lower the
; cast to plain register copies (v3..v6 into v7..v10). The default-subtarget
; run instead handles eight separate inputs (v3..v10): inside %if it emits one
; v_lshlrev_b32 by 16 per element, and after the join it emits
; v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32 to build the four stored dwords.
define void @v_bitcast_v8i16_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <8 x i16> %value) {
; GCN-LABEL: v_bitcast_v8i16_to_v8bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v15, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: v_mov_b32_e32 v14, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB95_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v10
; GCN-NEXT: .LBB95_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8i16_to_v8bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8i16_to_v8bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8i16_to_v8bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
; The %if block is entered only when %cond is zero; otherwise control goes
; straight to %end.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; Reinterpret each 16-bit integer lane as a bfloat lane; the IR requests no
; value conversion, only a change of type.
if:
%cast = bitcast <8 x i16> %value to <8 x bfloat>
br label %end
; The phi yields all-zero when the cast was skipped and the cast result
; otherwise; the merged vector is then written to %out.
; NOTE(review): the file's run lines raise
; -amdgpu-codegenprepare-break-large-phis-threshold, presumably so this vector
; phi reaches instruction selection unsplit -- confirm.
end:
%phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <8 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v16i8_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <16 x i8> %value) {
; GCN-LABEL: v_bitcast_v16i8_to_v8bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB96_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7
; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9
; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11
; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13
; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15
; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xff, v17
; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v18
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_or_b32_e32 v6, v6, v7
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
; GCN-NEXT: v_or_b32_e32 v12, v14, v15
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v16
; GCN-NEXT: v_or_b32_e32 v19, v3, v0
; GCN-NEXT: v_or_b32_e32 v23, v5, v4
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v6
; GCN-NEXT: v_or_b32_e32 v24, v9, v7
; GCN-NEXT: v_or_b32_e32 v21, v11, v8
; GCN-NEXT: v_or_b32_e32 v25, v13, v10
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v12
; GCN-NEXT: v_or_b32_e32 v0, v17, v14
; GCN-NEXT: .LBB96_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v22
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v16i8_to_v8bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB96_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v19, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10
; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v20, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14
; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v21, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v18
; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v22, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: .LBB96_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v16i8_to_v8bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB96_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v19, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10
; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v20, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14
; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v21, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v22, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: .LBB96_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v16i8_to_v8bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB96_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v8
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v12
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v9
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_lshlrev_b16 v6, 8, v10
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v11
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v13
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v14
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v15
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v16
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v17
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v18
; GFX11-NEXT: v_or_b32_e32 v5, v5, v6
; GFX11-NEXT: v_or_b32_e32 v6, v7, v8
; GFX11-NEXT: v_or_b32_e32 v7, v9, v10
; GFX11-NEXT: v_or_b32_e32 v8, v11, v12
; GFX11-NEXT: v_or_b32_e32 v9, v13, v14
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-NEXT: v_or_b32_e32 v19, v0, v3
; GFX11-NEXT: v_or_b32_e32 v20, v4, v5
; GFX11-NEXT: v_or_b32_e32 v21, v6, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v22, v8, v9
; GFX11-NEXT: .LBB96_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <16 x i8> %value to <8 x bfloat>
br label %end
end:
%phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <8 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <2 x i64> to <8 x bfloat>. The cast is performed only
; on the %if path; %end merges it with a zero vector through a phi and stores
; the merged value to %out.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In the expected output, the default amdgcn run splits every 32-bit input
; register into two values in %if (mask with 0xffff0000, shift left by 16) and
; repacks them after the join, while the tonga, gfx900 and gfx1100 runs expect
; plain register copies.
define void @v_bitcast_v2i64_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <2 x i64> %value) {
; GCN-LABEL: v_bitcast_v2i64_to_v8bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v10, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB97_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GCN-NEXT: .LBB97_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v2i64_to_v8bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v2i64_to_v8bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v2i64_to_v8bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 128 input bits as eight bfloat lanes; bitcast changes no bits.
%cast = bitcast <2 x i64> %value to <8 x bfloat>
br label %end
end:
; Zero vector when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <8 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <2 x double> to <8 x bfloat>. The cast is performed
; only on the %if path; %end merges it with a zero vector through a phi and
; stores the merged value to %out.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In the expected output, the default amdgcn run splits every 32-bit input
; register into two values in %if (mask with 0xffff0000, shift left by 16) and
; repacks them after the join, while the tonga, gfx900 and gfx1100 runs expect
; plain register copies.
define void @v_bitcast_v2f64_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <2 x double> %value) {
; GCN-LABEL: v_bitcast_v2f64_to_v8bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v10, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB98_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GCN-NEXT: .LBB98_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v2f64_to_v8bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v2f64_to_v8bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v2f64_to_v8bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 128 input bits as eight bfloat lanes; bitcast changes no bits.
%cast = bitcast <2 x double> %value to <8 x bfloat>
br label %end
end:
; Zero vector when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <8 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <4 x i32> to <8 x bfloat>. The cast is performed only
; on the %if path; %end merges it with a zero vector through a phi and stores
; the merged value to %out.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In the expected output, the default amdgcn run splits every 32-bit input
; register into two values in %if (mask with 0xffff0000, shift left by 16) and
; repacks them after the join, while the tonga, gfx900 and gfx1100 runs expect
; plain register copies.
define void @v_bitcast_v4i32_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <4 x i32> %value) {
; GCN-LABEL: v_bitcast_v4i32_to_v8bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v10, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB99_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GCN-NEXT: .LBB99_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4i32_to_v8bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4i32_to_v8bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4i32_to_v8bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 128 input bits as eight bfloat lanes; bitcast changes no bits.
%cast = bitcast <4 x i32> %value to <8 x bfloat>
br label %end
end:
; Zero vector when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <8 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <4 x float> to <8 x bfloat>. The cast is performed
; only on the %if path; %end merges it with a zero vector through a phi and
; stores the merged value to %out.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In the expected output, the default amdgcn run splits every 32-bit input
; register into two values in %if (mask with 0xffff0000, shift left by 16) and
; repacks them after the join, while the tonga, gfx900 and gfx1100 runs expect
; plain register copies.
define void @v_bitcast_v4f32_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <4 x float> %value) {
; GCN-LABEL: v_bitcast_v4f32_to_v8bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v10, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mov_b32_e32 v7, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB100_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GCN-NEXT: .LBB100_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4f32_to_v8bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v10, v7
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v10, v6
; VI-NEXT: v_mov_b32_e32 v9, v5
; VI-NEXT: v_mov_b32_e32 v8, v4
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4f32_to_v8bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v8, v7
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mov_b32_e32 v10, v7
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v10, v6
; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v8, v4
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4f32_to_v8bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v7, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 128 input bits as eight bfloat lanes; bitcast changes no bits.
%cast = bitcast <4 x float> %value to <8 x bfloat>
br label %end
end:
; Zero vector when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <8 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <16 x bfloat> to <16 x i16>. The cast is performed
; only on the %if path; %end merges it with a zero vector through a phi and
; stores the merged value to %out.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In the expected output, the default amdgcn run multiplies each of the sixteen
; input registers by 1.0 and packs them pairwise (shift right by 16, then
; v_alignbit) inside %if, while the tonga, gfx900 and gfx1100 runs expect plain
; register copies. Every run expects the result to be written with two 128-bit
; stores, at offsets 0 and 16 from %out.
define void @v_bitcast_v16bf16_to_v16i16(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v16bf16_to_v16i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v20, v19
; GCN-NEXT: v_mov_b32_e32 v21, v19
; GCN-NEXT: v_mov_b32_e32 v22, v19
; GCN-NEXT: v_mov_b32_e32 v23, v19
; GCN-NEXT: v_mov_b32_e32 v24, v19
; GCN-NEXT: v_mov_b32_e32 v25, v19
; GCN-NEXT: v_mov_b32_e32 v26, v19
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB101_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
; GCN-NEXT: .LBB101_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v16bf16_to_v16i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v16bf16_to_v16i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v16bf16_to_v16i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the sixteen bfloat lanes as sixteen i16 lanes; bitcast changes no bits.
%cast = bitcast <16 x bfloat> %value to <16 x i16>
br label %end
end:
; Zero vector when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <16 x i16> [zeroinitializer, %entry], [%cast, %if]
store <16 x i16> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <16 x bfloat> to <16 x half>. The cast is performed
; only on the %if path; %end merges it with a zero vector through a phi and
; stores the merged value to %out.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; In the expected output, the default amdgcn run handles each lane separately.
; In %if it multiplies by 1.0, shifts right by 16 and applies v_cvt_f32_f16;
; after the join it applies v_cvt_f16_f32 and packs lanes pairwise with a
; shift and an or. The tonga, gfx900 and gfx1100 runs expect plain register
; copies. Every run expects the result to be written with two 128-bit stores,
; at offsets 0 and 16 from %out.
define void @v_bitcast_v16bf16_to_v16f16(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v16bf16_to_v16f16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v26, 0
; GCN-NEXT: v_mov_b32_e32 v30, 0
; GCN-NEXT: v_mov_b32_e32 v27, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: v_mov_b32_e32 v28, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v29, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB102_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_cvt_f32_f16_e32 v26, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4
; GCN-NEXT: v_cvt_f32_f16_e32 v31, v5
; GCN-NEXT: v_cvt_f32_f16_e32 v28, v6
; GCN-NEXT: v_cvt_f32_f16_e32 v32, v7
; GCN-NEXT: v_cvt_f32_f16_e32 v29, v8
; GCN-NEXT: v_cvt_f32_f16_e32 v33, v9
; GCN-NEXT: v_cvt_f32_f16_e32 v19, v10
; GCN-NEXT: v_cvt_f32_f16_e32 v23, v11
; GCN-NEXT: v_cvt_f32_f16_e32 v20, v12
; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13
; GCN-NEXT: v_cvt_f32_f16_e32 v21, v14
; GCN-NEXT: v_cvt_f32_f16_e32 v25, v15
; GCN-NEXT: v_cvt_f32_f16_e32 v22, v16
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v17
; GCN-NEXT: .LBB102_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v26
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v31
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v27
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v32
; GCN-NEXT: v_cvt_f16_f32_e32 v8, v28
; GCN-NEXT: v_cvt_f16_f32_e32 v9, v33
; GCN-NEXT: v_cvt_f16_f32_e32 v10, v29
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_cvt_f16_f32_e32 v11, v23
; GCN-NEXT: v_cvt_f16_f32_e32 v12, v19
; GCN-NEXT: v_cvt_f16_f32_e32 v13, v24
; GCN-NEXT: v_cvt_f16_f32_e32 v14, v20
; GCN-NEXT: v_cvt_f16_f32_e32 v15, v25
; GCN-NEXT: v_cvt_f16_f32_e32 v16, v21
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v22
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_or_b32_e32 v3, v4, v3
; GCN-NEXT: v_or_b32_e32 v4, v6, v5
; GCN-NEXT: v_or_b32_e32 v5, v8, v7
; GCN-NEXT: v_or_b32_e32 v6, v10, v9
; GCN-NEXT: v_or_b32_e32 v7, v12, v11
; GCN-NEXT: v_or_b32_e32 v8, v14, v13
; GCN-NEXT: v_or_b32_e32 v9, v16, v15
; GCN-NEXT: v_or_b32_e32 v10, v17, v0
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v16bf16_to_v16f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v16bf16_to_v16f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v16bf16_to_v16f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the sixteen bfloat lanes as sixteen half lanes; bitcast changes no bits.
%cast = bitcast <16 x bfloat> %value to <16 x half>
br label %end
end:
; Zero vector when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <16 x half> [zeroinitializer, %entry], [%cast, %if]
store <16 x half> %phi, ptr addrspace(1) %out
ret void
}
; Test: choose between a bitcast of %value (<16 x bfloat> -> <8 x i32>) and an
; all-zero vector, then store the 32-byte result to %out.
;   %cond  - the bitcast path (%if) is taken only when this equals 0
;   %out   - addrspace(1) destination pointer of the final store
;   %value - the <16 x bfloat> source vector
; The check lines below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
; Expected code, as recorded by the checks: the run without -mcpu rebuilds each
; result dword in %if from two input registers (v_mul_f32 by 1.0, v_lshrrev_b32
; by 16, v_alignbit_b32); the tonga, gfx900 and gfx1100 runs only copy v3..v10
; into v11..v18.
; NOTE(review): the extra work in the run without -mcpu presumably comes from
; each bfloat element arriving in its own 32-bit register there - confirm.
define void @v_bitcast_v16bf16_to_v8i32(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v16bf16_to_v8i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v20, v19
; GCN-NEXT: v_mov_b32_e32 v21, v19
; GCN-NEXT: v_mov_b32_e32 v22, v19
; GCN-NEXT: v_mov_b32_e32 v23, v19
; GCN-NEXT: v_mov_b32_e32 v24, v19
; GCN-NEXT: v_mov_b32_e32 v25, v19
; GCN-NEXT: v_mov_b32_e32 v26, v19
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB103_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
; GCN-NEXT: .LBB103_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v16bf16_to_v8i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v16bf16_to_v8i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v16bf16_to_v8i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 256 bits of %value as <8 x i32>.
%cast = bitcast <16 x bfloat> %value to <8 x i32>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <8 x i32> [zeroinitializer, %entry], [%cast, %if]
store <8 x i32> %phi, ptr addrspace(1) %out
ret void
}
; Test: choose between a bitcast of %value (<16 x bfloat> -> <8 x float>) and an
; all-zero vector, then store the 32-byte result to %out.
;   %cond  - the bitcast path (%if) is taken only when this equals 0
;   %out   - addrspace(1) destination pointer of the final store
;   %value - the <16 x bfloat> source vector
; The check lines below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
; Expected code, as recorded by the checks: the run without -mcpu rebuilds each
; result dword in %if from two input registers (v_mul_f32 by 1.0, v_lshrrev_b32
; by 16, v_alignbit_b32); the tonga, gfx900 and gfx1100 runs only copy v3..v10
; into v11..v18.
; NOTE(review): the extra work in the run without -mcpu presumably comes from
; each bfloat element arriving in its own 32-bit register there - confirm.
define void @v_bitcast_v16bf16_to_v8f32(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v16bf16_to_v8f32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v20, v19
; GCN-NEXT: v_mov_b32_e32 v21, v19
; GCN-NEXT: v_mov_b32_e32 v22, v19
; GCN-NEXT: v_mov_b32_e32 v23, v19
; GCN-NEXT: v_mov_b32_e32 v24, v19
; GCN-NEXT: v_mov_b32_e32 v25, v19
; GCN-NEXT: v_mov_b32_e32 v26, v19
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB104_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
; GCN-NEXT: .LBB104_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v16bf16_to_v8f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v16bf16_to_v8f32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v16bf16_to_v8f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 256 bits of %value as <8 x float>.
%cast = bitcast <16 x bfloat> %value to <8 x float>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <8 x float> [zeroinitializer, %entry], [%cast, %if]
store <8 x float> %phi, ptr addrspace(1) %out
ret void
}
; Test: choose between a bitcast of %value (<16 x bfloat> -> <4 x double>) and an
; all-zero vector, then store the 32-byte result to %out.
;   %cond  - the bitcast path (%if) is taken only when this equals 0
;   %out   - addrspace(1) destination pointer of the final store
;   %value - the <16 x bfloat> source vector
; The check lines below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
; Expected code, as recorded by the checks: the run without -mcpu rebuilds each
; result dword in %if from two input registers (v_mul_f32 by 1.0, v_lshrrev_b32
; by 16, v_alignbit_b32); the tonga, gfx900 and gfx1100 runs only copy v3..v10
; into v11..v18.
; NOTE(review): the extra work in the run without -mcpu presumably comes from
; each bfloat element arriving in its own 32-bit register there - confirm.
define void @v_bitcast_v16bf16_to_v4f64(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v16bf16_to_v4f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v20, v19
; GCN-NEXT: v_mov_b32_e32 v21, v19
; GCN-NEXT: v_mov_b32_e32 v22, v19
; GCN-NEXT: v_mov_b32_e32 v23, v19
; GCN-NEXT: v_mov_b32_e32 v24, v19
; GCN-NEXT: v_mov_b32_e32 v25, v19
; GCN-NEXT: v_mov_b32_e32 v26, v19
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB105_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
; GCN-NEXT: .LBB105_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v16bf16_to_v4f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v16bf16_to_v4f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v16bf16_to_v4f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 256 bits of %value as <4 x double>.
%cast = bitcast <16 x bfloat> %value to <4 x double>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <4 x double> [zeroinitializer, %entry], [%cast, %if]
store <4 x double> %phi, ptr addrspace(1) %out
ret void
}
; Test: choose between a bitcast of %value (<16 x bfloat> -> <4 x i64>) and an
; all-zero vector, then store the 32-byte result to %out.
;   %cond  - the bitcast path (%if) is taken only when this equals 0
;   %out   - addrspace(1) destination pointer of the final store
;   %value - the <16 x bfloat> source vector
; The check lines below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
; Expected code, as recorded by the checks: the run without -mcpu rebuilds each
; result dword in %if from two input registers (v_mul_f32 by 1.0, v_lshrrev_b32
; by 16, v_alignbit_b32); the tonga, gfx900 and gfx1100 runs only copy v3..v10
; into v11..v18.
; NOTE(review): the extra work in the run without -mcpu presumably comes from
; each bfloat element arriving in its own 32-bit register there - confirm.
define void @v_bitcast_v16bf16_to_v4i64(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v16bf16_to_v4i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v20, v19
; GCN-NEXT: v_mov_b32_e32 v21, v19
; GCN-NEXT: v_mov_b32_e32 v22, v19
; GCN-NEXT: v_mov_b32_e32 v23, v19
; GCN-NEXT: v_mov_b32_e32 v24, v19
; GCN-NEXT: v_mov_b32_e32 v25, v19
; GCN-NEXT: v_mov_b32_e32 v26, v19
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB106_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
; GCN-NEXT: .LBB106_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v16bf16_to_v4i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v16bf16_to_v4i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v16bf16_to_v4i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 256 bits of %value as <4 x i64>.
%cast = bitcast <16 x bfloat> %value to <4 x i64>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <4 x i64> [zeroinitializer, %entry], [%cast, %if]
store <4 x i64> %phi, ptr addrspace(1) %out
ret void
}
; Test: choose between a bitcast of %value (<16 x bfloat> -> <32 x i8>) and an
; all-zero vector, then store the 32-byte result to %out.
;   %cond  - the bitcast path (%if) is taken only when this equals 0
;   %out   - addrspace(1) destination pointer of the final store
;   %value - the <16 x bfloat> source vector
; The check lines below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
; Expected code, as recorded by the checks: the run without -mcpu rebuilds each
; result dword in %if from two input registers (v_mul_f32 by 1.0, v_lshrrev_b32
; by 16, v_alignbit_b32); the tonga, gfx900 and gfx1100 runs only copy v3..v10
; into v11..v18.
; NOTE(review): the extra work in the run without -mcpu presumably comes from
; each bfloat element arriving in its own 32-bit register there - confirm.
define void @v_bitcast_v16bf16_to_v32i8(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v16bf16_to_v32i8:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v20, v19
; GCN-NEXT: v_mov_b32_e32 v21, v19
; GCN-NEXT: v_mov_b32_e32 v22, v19
; GCN-NEXT: v_mov_b32_e32 v23, v19
; GCN-NEXT: v_mov_b32_e32 v24, v19
; GCN-NEXT: v_mov_b32_e32 v25, v19
; GCN-NEXT: v_mov_b32_e32 v26, v19
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB107_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
; GCN-NEXT: .LBB107_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v16bf16_to_v32i8:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v16bf16_to_v32i8:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v16bf16_to_v32i8:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 256 bits of %value as <32 x i8>.
%cast = bitcast <16 x bfloat> %value to <32 x i8>
br label %end
end:
; Zero when arriving from %entry, the bitcast result when arriving from %if.
%phi = phi <32 x i8> [zeroinitializer, %entry], [%cast, %if]
store <32 x i8> %phi, ptr addrspace(1) %out
ret void
}
; Test: reinterpret <8 x float> as <16 x bfloat> (both 256 bits wide) on a
; conditionally executed path, then store the merged value to %out.
; The check lines inside this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; Expected code shape: the GCN check block splits every 32-bit register into
; its high and low 16-bit halves (v_and_b32 / v_lshlrev_b32) and repacks them
; with v_alignbit_b32 before the store, while the VI, GFX9 and GFX11 check
; blocks only copy the eight source registers.
define void @v_bitcast_v8f32_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <8 x float> %value) {
; GCN-LABEL: v_bitcast_v8f32_to_v16bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: v_mov_b32_e32 v14, 0
; GCN-NEXT: v_mov_b32_e32 v15, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB108_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3
; GCN-NEXT: .LBB108_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16
; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16
; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8f32_to_v16bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8f32_to_v16bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8f32_to_v16bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <8 x float> %value to <16 x bfloat>
br label %end
end:
; %phi is all-zero when arriving from %entry and the bitcast result when
; arriving from %if; storing it keeps the bitcast live.
%phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <16 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Test: reinterpret <8 x i32> as <16 x bfloat> (both 256 bits wide) on a
; conditionally executed path, then store the merged value to %out.
; The check lines inside this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; Expected code shape: the GCN check block splits every 32-bit register into
; its high and low 16-bit halves (v_and_b32 / v_lshlrev_b32) and repacks them
; with v_alignbit_b32 before the store, while the VI, GFX9 and GFX11 check
; blocks only copy the eight source registers.
define void @v_bitcast_v8i32_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <8 x i32> %value) {
; GCN-LABEL: v_bitcast_v8i32_to_v16bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: v_mov_b32_e32 v14, 0
; GCN-NEXT: v_mov_b32_e32 v15, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB109_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3
; GCN-NEXT: .LBB109_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16
; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16
; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8i32_to_v16bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8i32_to_v16bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8i32_to_v16bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <8 x i32> %value to <16 x bfloat>
br label %end
end:
; %phi is all-zero when arriving from %entry and the bitcast result when
; arriving from %if; storing it keeps the bitcast live.
%phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <16 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Test: reinterpret <4 x i64> as <16 x bfloat> (both 256 bits wide) on a
; conditionally executed path, then store the merged value to %out.
; The check lines inside this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; Expected code shape: the GCN check block splits every 32-bit register into
; its high and low 16-bit halves (v_and_b32 / v_lshlrev_b32) and repacks them
; with v_alignbit_b32 before the store, while the VI, GFX9 and GFX11 check
; blocks only copy the eight source registers.
define void @v_bitcast_v4i64_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <4 x i64> %value) {
; GCN-LABEL: v_bitcast_v4i64_to_v16bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: v_mov_b32_e32 v14, 0
; GCN-NEXT: v_mov_b32_e32 v15, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB110_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3
; GCN-NEXT: .LBB110_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16
; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16
; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4i64_to_v16bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4i64_to_v16bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4i64_to_v16bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <4 x i64> %value to <16 x bfloat>
br label %end
end:
; %phi is all-zero when arriving from %entry and the bitcast result when
; arriving from %if; storing it keeps the bitcast live.
%phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <16 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Test: reinterpret <4 x double> as <16 x bfloat> (both 256 bits wide) on a
; conditionally executed path, then store the merged value to %out.
; The check lines inside this function are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; Expected code shape: the GCN check block splits every 32-bit register into
; its high and low 16-bit halves (v_and_b32 / v_lshlrev_b32) and repacks them
; with v_alignbit_b32 before the store, while the VI, GFX9 and GFX11 check
; blocks only copy the eight source registers.
define void @v_bitcast_v4f64_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <4 x double> %value) {
; GCN-LABEL: v_bitcast_v4f64_to_v16bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: v_mov_b32_e32 v14, 0
; GCN-NEXT: v_mov_b32_e32 v15, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB111_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3
; GCN-NEXT: .LBB111_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16
; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16
; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v4f64_to_v16bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v12, v11
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mov_b32_e32 v14, v11
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v16, v11
; VI-NEXT: v_mov_b32_e32 v17, v11
; VI-NEXT: v_mov_b32_e32 v18, v11
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v18, v10
; VI-NEXT: v_mov_b32_e32 v17, v9
; VI-NEXT: v_mov_b32_e32 v16, v8
; VI-NEXT: v_mov_b32_e32 v15, v7
; VI-NEXT: v_mov_b32_e32 v14, v6
; VI-NEXT: v_mov_b32_e32 v13, v5
; VI-NEXT: v_mov_b32_e32 v12, v4
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: ; %bb.2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v4f64_to_v16bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v12, v11
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v11
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v16, v11
; GFX9-NEXT: v_mov_b32_e32 v17, v11
; GFX9-NEXT: v_mov_b32_e32 v18, v11
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: v_mov_b32_e32 v17, v9
; GFX9-NEXT: v_mov_b32_e32 v16, v8
; GFX9-NEXT: v_mov_b32_e32 v15, v7
; GFX9-NEXT: v_mov_b32_e32 v14, v6
; GFX9-NEXT: v_mov_b32_e32 v13, v5
; GFX9-NEXT: v_mov_b32_e32 v12, v4
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v4f64_to_v16bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v11, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v12, v11
; GFX11-NEXT: v_mov_b32_e32 v13, v11
; GFX11-NEXT: v_mov_b32_e32 v14, v11
; GFX11-NEXT: v_mov_b32_e32 v15, v11
; GFX11-NEXT: v_mov_b32_e32 v16, v11
; GFX11-NEXT: v_mov_b32_e32 v17, v11
; GFX11-NEXT: v_mov_b32_e32 v18, v11
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
; GFX11-NEXT: ; %bb.2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Branch to %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test.
%cast = bitcast <4 x double> %value to <16 x bfloat>
br label %end
end:
; %phi is all-zero when arriving from %entry and the bitcast result when
; arriving from %if; storing it keeps the bitcast live.
%phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <16 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v32i8_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <32 x i8> %value) {
; GCN-LABEL: v_bitcast_v32i8_to_v16bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB112_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7
; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9
; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11
; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13
; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15
; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xff, v17
; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19
; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xff, v21
; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xff, v23
; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xff, v25
; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xff, v27
; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xff, v29
; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v30, 0xff, v38
; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v37
; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v31
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_or_b32_e32 v6, v6, v7
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
; GCN-NEXT: v_or_b32_e32 v12, v14, v15
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v16
; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v18
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
; GCN-NEXT: v_or_b32_e32 v18, v22, v23
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v24
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26
; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v28
; GCN-NEXT: v_or_b32_e32 v24, v30, v33
; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32
; GCN-NEXT: v_or_b32_e32 v50, v3, v0
; GCN-NEXT: v_or_b32_e32 v54, v5, v4
; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v6
; GCN-NEXT: v_or_b32_e32 v55, v9, v7
; GCN-NEXT: v_or_b32_e32 v52, v11, v8
; GCN-NEXT: v_or_b32_e32 v40, v13, v10
; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v12
; GCN-NEXT: v_or_b32_e32 v41, v17, v14
; GCN-NEXT: v_or_b32_e32 v33, v19, v15
; GCN-NEXT: v_or_b32_e32 v39, v21, v16
; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v18
; GCN-NEXT: v_or_b32_e32 v48, v25, v20
; GCN-NEXT: v_or_b32_e32 v35, v27, v22
; GCN-NEXT: v_or_b32_e32 v49, v29, v23
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v24
; GCN-NEXT: v_or_b32_e32 v0, v31, v26
; GCN-NEXT: .LBB112_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v53
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v33
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v48
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v34
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v49
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v35
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v36
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
; GCN-NEXT: v_alignbit_b32 v10, v0, v17, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32i8_to_v16bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32
; VI-NEXT: v_mov_b32_e32 v31, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v32, v31
; VI-NEXT: v_mov_b32_e32 v33, v31
; VI-NEXT: v_mov_b32_e32 v34, v31
; VI-NEXT: v_mov_b32_e32 v35, v31
; VI-NEXT: v_mov_b32_e32 v36, v31
; VI-NEXT: v_mov_b32_e32 v37, v31
; VI-NEXT: v_mov_b32_e32 v38, v31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB112_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v31, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10
; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v32, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14
; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v33, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v18
; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v34, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v22
; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v35, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v26
; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v36, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v30
; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v37, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v48
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v38, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: .LBB112_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32i8_to_v16bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32
; GFX9-NEXT: v_mov_b32_e32 v31, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v32, v31
; GFX9-NEXT: v_mov_b32_e32 v33, v31
; GFX9-NEXT: v_mov_b32_e32 v34, v31
; GFX9-NEXT: v_mov_b32_e32 v35, v31
; GFX9-NEXT: v_mov_b32_e32 v36, v31
; GFX9-NEXT: v_mov_b32_e32 v37, v31
; GFX9-NEXT: v_mov_b32_e32 v38, v31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB112_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_perm_b32 v31, v3, v0, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10
; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v32, v3, v0, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14
; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v33, v3, v0, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v18
; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v34, v3, v0, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v22
; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v35, v3, v0, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v26
; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v36, v3, v0, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v30
; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v37, v3, v0, s6
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v50
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v48
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v38, v3, v0, s6
; GFX9-NEXT: .LBB112_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32i8_to_v16bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:12
; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:8
; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:4
; GFX11-NEXT: scratch_load_u16 v50, off, s32
; GFX11-NEXT: v_mov_b32_e32 v31, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v32, v31
; GFX11-NEXT: v_mov_b32_e32 v33, v31
; GFX11-NEXT: v_mov_b32_e32 v34, v31
; GFX11-NEXT: v_mov_b32_e32 v35, v31
; GFX11-NEXT: v_mov_b32_e32 v36, v31
; GFX11-NEXT: v_mov_b32_e32 v37, v31
; GFX11-NEXT: v_mov_b32_e32 v38, v31
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB112_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v9
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v8
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v10
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v11
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v12
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v13
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v14
; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
; GFX11-NEXT: v_or_b32_e32 v3, v6, v3
; GFX11-NEXT: v_or_b32_e32 v5, v7, v8
; GFX11-NEXT: v_or_b32_e32 v6, v9, v10
; GFX11-NEXT: v_or_b32_e32 v7, v11, v12
; GFX11-NEXT: v_perm_b32 v31, v4, v0, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v15
; GFX11-NEXT: v_perm_b32 v32, v5, v3, 0x5040100
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v16
; GFX11-NEXT: v_perm_b32 v33, v7, v6, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v17
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v18
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v19
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v20
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v21
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v22
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v23
; GFX11-NEXT: v_lshlrev_b16 v11, 8, v24
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v25
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v26
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v27
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v28
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v29
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v30
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v50
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v49
; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v48
; GFX11-NEXT: v_lshlrev_b16 v16, 8, v39
; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
; GFX11-NEXT: v_perm_b32 v34, v3, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v35, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v36, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v37, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v38, v11, v10, 0x5040100
; GFX11-NEXT: .LBB112_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <32 x i8> %value to <16 x bfloat>
br label %end
end:
%phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <16 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <32 x bfloat> to <8 x i64> on one side of a branch, merge the result
; with zeroinitializer through a phi in %end, and store the merged vector to
; %out.
;
; The expected output below is autogenerated (see the NOTE at the top of the
; file); regenerate it with utils/update_llc_test_checks.py instead of editing
; it by hand. For the VI, GFX9 and GFX11 prefixes the %if block is expected to
; be plain v_mov register copies, while for the GCN prefix each dword is
; rebuilt with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32.
define void @v_bitcast_v32bf16_to_v8i64(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v32bf16_to_v8i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v32, v31
; GCN-NEXT: v_mov_b32_e32 v33, v31
; GCN-NEXT: v_mov_b32_e32 v34, v31
; GCN-NEXT: v_mov_b32_e32 v35, v31
; GCN-NEXT: v_mov_b32_e32 v36, v31
; GCN-NEXT: v_mov_b32_e32 v37, v31
; GCN-NEXT: v_mov_b32_e32 v38, v31
; GCN-NEXT: v_mov_b32_e32 v48, v31
; GCN-NEXT: v_mov_b32_e32 v49, v31
; GCN-NEXT: v_mov_b32_e32 v50, v31
; GCN-NEXT: v_mov_b32_e32 v51, v31
; GCN-NEXT: v_mov_b32_e32 v52, v31
; GCN-NEXT: v_mov_b32_e32 v53, v31
; GCN-NEXT: v_mov_b32_e32 v54, v31
; GCN-NEXT: v_mov_b32_e32 v55, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB113_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
; GCN-NEXT: .LBB113_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32bf16_to_v8i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB113_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB113_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32bf16_to_v8i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB113_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB113_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32bf16_to_v8i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB113_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB113_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Only the %cond == 0 path executes the bitcast; otherwise control goes
; straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <32 x bfloat> %value to <8 x i64>
br label %end
end:
; All-zero vector when arriving from %entry, the bitcast value when arriving
; from %if.
%phi = phi <8 x i64> [zeroinitializer, %entry], [%cast, %if]
store <8 x i64> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast <32 x bfloat> to <8 x double> on one side of a branch, merge the
; result with zeroinitializer through a phi in %end, and store the merged
; vector to %out.
;
; The expected output below is autogenerated (see the NOTE at the top of the
; file); regenerate it with utils/update_llc_test_checks.py instead of editing
; it by hand. For the VI, GFX9 and GFX11 prefixes the %if block is expected to
; be plain v_mov register copies, while for the GCN prefix each dword is
; rebuilt with v_mul_f32 / v_lshrrev_b32 / v_alignbit_b32.
define void @v_bitcast_v32bf16_to_v8f64(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v32bf16_to_v8f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v32, v31
; GCN-NEXT: v_mov_b32_e32 v33, v31
; GCN-NEXT: v_mov_b32_e32 v34, v31
; GCN-NEXT: v_mov_b32_e32 v35, v31
; GCN-NEXT: v_mov_b32_e32 v36, v31
; GCN-NEXT: v_mov_b32_e32 v37, v31
; GCN-NEXT: v_mov_b32_e32 v38, v31
; GCN-NEXT: v_mov_b32_e32 v48, v31
; GCN-NEXT: v_mov_b32_e32 v49, v31
; GCN-NEXT: v_mov_b32_e32 v50, v31
; GCN-NEXT: v_mov_b32_e32 v51, v31
; GCN-NEXT: v_mov_b32_e32 v52, v31
; GCN-NEXT: v_mov_b32_e32 v53, v31
; GCN-NEXT: v_mov_b32_e32 v54, v31
; GCN-NEXT: v_mov_b32_e32 v55, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB114_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
; GCN-NEXT: .LBB114_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32bf16_to_v8f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB114_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB114_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32bf16_to_v8f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB114_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB114_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32bf16_to_v8f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB114_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB114_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Only the %cond == 0 path executes the bitcast; otherwise control goes
; straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <32 x bfloat> %value to <8 x double>
br label %end
end:
; All-zero vector when arriving from %entry, the bitcast value when arriving
; from %if.
%phi = phi <8 x double> [zeroinitializer, %entry], [%cast, %if]
store <8 x double> %phi, ptr addrspace(1) %out
ret void
}
; Codegen test: reinterpret a <32 x bfloat> argument as <16 x i32> (both are
; 512 bits wide) on one side of a branch and store the merged result.
;   %cond  - selects the path: the bitcast is only performed when it equals 0.
;   %out   - global (addrspace 1) pointer that receives the 64-byte result.
;   %value - the <32 x bfloat> input being reinterpreted.
; The expectations below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define void @v_bitcast_v32bf16_to_v16i32(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v32bf16_to_v16i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v32, v31
; GCN-NEXT: v_mov_b32_e32 v33, v31
; GCN-NEXT: v_mov_b32_e32 v34, v31
; GCN-NEXT: v_mov_b32_e32 v35, v31
; GCN-NEXT: v_mov_b32_e32 v36, v31
; GCN-NEXT: v_mov_b32_e32 v37, v31
; GCN-NEXT: v_mov_b32_e32 v38, v31
; GCN-NEXT: v_mov_b32_e32 v48, v31
; GCN-NEXT: v_mov_b32_e32 v49, v31
; GCN-NEXT: v_mov_b32_e32 v50, v31
; GCN-NEXT: v_mov_b32_e32 v51, v31
; GCN-NEXT: v_mov_b32_e32 v52, v31
; GCN-NEXT: v_mov_b32_e32 v53, v31
; GCN-NEXT: v_mov_b32_e32 v54, v31
; GCN-NEXT: v_mov_b32_e32 v55, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB115_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
; GCN-NEXT: .LBB115_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32bf16_to_v16i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB115_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB115_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32bf16_to_v16i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB115_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB115_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32bf16_to_v16i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB115_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB115_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
; Take the %if path only when %cond is zero; otherwise fall through to %end.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; The bitcast under test: same bits, viewed as sixteen 32-bit integers.
if:
%cast = bitcast <32 x bfloat> %value to <16 x i32>
br label %end
; %phi is all zeros when reached straight from %entry and the reinterpreted
; input when reached from %if; the merged value is what gets stored to %out.
end:
%phi = phi <16 x i32> [zeroinitializer, %entry], [%cast, %if]
store <16 x i32> %phi, ptr addrspace(1) %out
ret void
}
; Codegen test: reinterpret a <32 x bfloat> argument as <16 x float> (both are
; 512 bits wide) on one side of a branch and store the merged result.
;   %cond  - selects the path: the bitcast is only performed when it equals 0.
;   %out   - global (addrspace 1) pointer that receives the 64-byte result.
;   %value - the <32 x bfloat> input being reinterpreted.
; The expectations below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define void @v_bitcast_v32bf16_to_v16f32(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v32bf16_to_v16f32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v32, v31
; GCN-NEXT: v_mov_b32_e32 v33, v31
; GCN-NEXT: v_mov_b32_e32 v34, v31
; GCN-NEXT: v_mov_b32_e32 v35, v31
; GCN-NEXT: v_mov_b32_e32 v36, v31
; GCN-NEXT: v_mov_b32_e32 v37, v31
; GCN-NEXT: v_mov_b32_e32 v38, v31
; GCN-NEXT: v_mov_b32_e32 v48, v31
; GCN-NEXT: v_mov_b32_e32 v49, v31
; GCN-NEXT: v_mov_b32_e32 v50, v31
; GCN-NEXT: v_mov_b32_e32 v51, v31
; GCN-NEXT: v_mov_b32_e32 v52, v31
; GCN-NEXT: v_mov_b32_e32 v53, v31
; GCN-NEXT: v_mov_b32_e32 v54, v31
; GCN-NEXT: v_mov_b32_e32 v55, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB116_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
; GCN-NEXT: .LBB116_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32bf16_to_v16f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB116_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB116_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32bf16_to_v16f32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB116_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB116_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32bf16_to_v16f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB116_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB116_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
; Take the %if path only when %cond is zero; otherwise fall through to %end.
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
; The bitcast under test: same bits, viewed as sixteen 32-bit floats.
if:
%cast = bitcast <32 x bfloat> %value to <16 x float>
br label %end
; %phi is all zeros when reached straight from %entry and the reinterpreted
; input when reached from %if; the merged value is what gets stored to %out.
end:
%phi = phi <16 x float> [zeroinitializer, %entry], [%cast, %if]
store <16 x float> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v32bf16_to_v32f16(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v32bf16_to_v32f16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v46, 0
; GCN-NEXT: v_mov_b32_e32 v60, 0
; GCN-NEXT: v_mov_b32_e32 v56, 0
; GCN-NEXT: v_mov_b32_e32 v61, 0
; GCN-NEXT: v_mov_b32_e32 v57, 0
; GCN-NEXT: v_mov_b32_e32 v62, 0
; GCN-NEXT: v_mov_b32_e32 v58, 0
; GCN-NEXT: v_mov_b32_e32 v63, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v45, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB117_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v47
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v59
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v31
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v32
; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v33
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_cvt_f32_f16_e32 v46, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v56, v4
; GCN-NEXT: v_cvt_f32_f16_e32 v61, v5
; GCN-NEXT: v_cvt_f32_f16_e32 v57, v6
; GCN-NEXT: v_cvt_f32_f16_e32 v62, v7
; GCN-NEXT: v_cvt_f32_f16_e32 v58, v8
; GCN-NEXT: v_cvt_f32_f16_e32 v63, v9
; GCN-NEXT: v_cvt_f32_f16_e32 v54, v10
; GCN-NEXT: v_cvt_f32_f16_e32 v42, v11
; GCN-NEXT: v_cvt_f32_f16_e32 v55, v12
; GCN-NEXT: v_cvt_f32_f16_e32 v43, v13
; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14
; GCN-NEXT: v_cvt_f32_f16_e32 v44, v15
; GCN-NEXT: v_cvt_f32_f16_e32 v41, v16
; GCN-NEXT: v_cvt_f32_f16_e32 v45, v17
; GCN-NEXT: v_cvt_f32_f16_e32 v38, v18
; GCN-NEXT: v_cvt_f32_f16_e32 v50, v19
; GCN-NEXT: v_cvt_f32_f16_e32 v39, v20
; GCN-NEXT: v_cvt_f32_f16_e32 v51, v21
; GCN-NEXT: v_cvt_f32_f16_e32 v48, v22
; GCN-NEXT: v_cvt_f32_f16_e32 v52, v23
; GCN-NEXT: v_cvt_f32_f16_e32 v49, v24
; GCN-NEXT: v_cvt_f32_f16_e32 v53, v25
; GCN-NEXT: v_cvt_f32_f16_e32 v31, v26
; GCN-NEXT: v_cvt_f32_f16_e32 v35, v27
; GCN-NEXT: v_cvt_f32_f16_e32 v32, v28
; GCN-NEXT: v_cvt_f32_f16_e32 v36, v34
; GCN-NEXT: v_cvt_f32_f16_e32 v33, v30
; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37
; GCN-NEXT: v_cvt_f32_f16_e32 v34, v47
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v29
; GCN-NEXT: .LBB117_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v60
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v46
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v61
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v56
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v62
; GCN-NEXT: v_cvt_f16_f32_e32 v8, v57
; GCN-NEXT: v_cvt_f16_f32_e32 v9, v63
; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_cvt_f16_f32_e32 v11, v42
; GCN-NEXT: v_cvt_f16_f32_e32 v12, v54
; GCN-NEXT: v_cvt_f16_f32_e32 v13, v43
; GCN-NEXT: v_cvt_f16_f32_e32 v14, v55
; GCN-NEXT: v_cvt_f16_f32_e32 v15, v44
; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v45
; GCN-NEXT: v_cvt_f16_f32_e32 v18, v41
; GCN-NEXT: v_cvt_f16_f32_e32 v19, v50
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v38
; GCN-NEXT: v_cvt_f16_f32_e32 v21, v51
; GCN-NEXT: v_cvt_f16_f32_e32 v22, v39
; GCN-NEXT: v_cvt_f16_f32_e32 v23, v52
; GCN-NEXT: v_cvt_f16_f32_e32 v24, v48
; GCN-NEXT: v_cvt_f16_f32_e32 v25, v53
; GCN-NEXT: v_cvt_f16_f32_e32 v26, v49
; GCN-NEXT: v_cvt_f16_f32_e32 v27, v35
; GCN-NEXT: v_cvt_f16_f32_e32 v28, v31
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_cvt_f16_f32_e32 v29, v36
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v30, v32
; GCN-NEXT: v_cvt_f16_f32_e32 v31, v37
; GCN-NEXT: v_cvt_f16_f32_e32 v32, v33
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v33, v34
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_or_b32_e32 v3, v4, v3
; GCN-NEXT: v_or_b32_e32 v4, v6, v5
; GCN-NEXT: v_or_b32_e32 v5, v8, v7
; GCN-NEXT: v_or_b32_e32 v6, v10, v9
; GCN-NEXT: v_or_b32_e32 v7, v12, v11
; GCN-NEXT: v_or_b32_e32 v8, v14, v13
; GCN-NEXT: v_or_b32_e32 v9, v16, v15
; GCN-NEXT: v_or_b32_e32 v10, v18, v17
; GCN-NEXT: v_or_b32_e32 v11, v20, v19
; GCN-NEXT: v_or_b32_e32 v12, v22, v21
; GCN-NEXT: v_or_b32_e32 v13, v24, v23
; GCN-NEXT: v_or_b32_e32 v14, v26, v25
; GCN-NEXT: v_or_b32_e32 v15, v28, v27
; GCN-NEXT: v_or_b32_e32 v16, v30, v29
; GCN-NEXT: v_or_b32_e32 v17, v32, v31
; GCN-NEXT: v_or_b32_e32 v18, v33, v0
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32bf16_to_v32f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB117_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB117_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32bf16_to_v32f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB117_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB117_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32bf16_to_v32f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB117_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB117_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <32 x bfloat> %value to <32 x half>
br label %end
end:
%phi = phi <32 x half> [zeroinitializer, %entry], [%cast, %if]
store <32 x half> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <32 x bfloat> to <32 x i16>.
; When %cond is zero the %if block reinterprets %value; otherwise the phi in
; %end supplies zeroinitializer. The merged value is stored through the
; addrspace(1) pointer %out.
;
; What the assertions below pin down, per llc invocation in the file header:
;  - tonga, gfx900 and gfx1100: the %if block contains only register copies
;    (v3..v18 into v19..v34), followed by four 128-bit stores.
;  - first invocation (no -mcpu): the 32 inputs are read from v3..v30 plus four
;    dwords loaded from the stack; the %if block multiplies each by 1.0
;    (presumably a canonicalize - confirm) and packs pairs into 16 dwords with
;    v_lshrrev_b32 and v_alignbit_b32. v40-v42 are spilled and reloaded
;    around the body.
; The assertion lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @v_bitcast_v32bf16_to_v32i16(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v32bf16_to_v32i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v32, v31
; GCN-NEXT: v_mov_b32_e32 v33, v31
; GCN-NEXT: v_mov_b32_e32 v34, v31
; GCN-NEXT: v_mov_b32_e32 v35, v31
; GCN-NEXT: v_mov_b32_e32 v36, v31
; GCN-NEXT: v_mov_b32_e32 v37, v31
; GCN-NEXT: v_mov_b32_e32 v38, v31
; GCN-NEXT: v_mov_b32_e32 v48, v31
; GCN-NEXT: v_mov_b32_e32 v49, v31
; GCN-NEXT: v_mov_b32_e32 v50, v31
; GCN-NEXT: v_mov_b32_e32 v51, v31
; GCN-NEXT: v_mov_b32_e32 v52, v31
; GCN-NEXT: v_mov_b32_e32 v53, v31
; GCN-NEXT: v_mov_b32_e32 v54, v31
; GCN-NEXT: v_mov_b32_e32 v55, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB118_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
; GCN-NEXT: .LBB118_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32bf16_to_v32i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB118_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB118_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32bf16_to_v32i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB118_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB118_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32bf16_to_v32i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB118_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB118_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 512 bits of %value as 32 i16 lanes; no value conversion.
%cast = bitcast <32 x bfloat> %value to <32 x i16>
br label %end
end:
; All-zero when arriving from %entry, the cast bits when arriving from %if.
%phi = phi <32 x i16> [zeroinitializer, %entry], [%cast, %if]
store <32 x i16> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <32 x bfloat> to <64 x i8>.
; When %cond is zero the %if block reinterprets %value; otherwise the phi in
; %end supplies zeroinitializer. The merged value is stored through the
; addrspace(1) pointer %out.
;
; What the assertions below pin down, per llc invocation in the file header:
;  - tonga, gfx900 and gfx1100: the %if block contains only register copies
;    (v3..v18 into v19..v34), followed by four 128-bit stores; no per-byte
;    instructions are expected.
;  - first invocation (no -mcpu): the 32 inputs are read from v3..v30 plus four
;    dwords loaded from the stack; the %if block multiplies each by 1.0
;    (presumably a canonicalize - confirm) and packs pairs into 16 dwords with
;    v_lshrrev_b32 and v_alignbit_b32. v40-v42 are spilled and reloaded
;    around the body.
; The assertion lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define void @v_bitcast_v32bf16_to_v64i8(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v32bf16_to_v64i8:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v32, v31
; GCN-NEXT: v_mov_b32_e32 v33, v31
; GCN-NEXT: v_mov_b32_e32 v34, v31
; GCN-NEXT: v_mov_b32_e32 v35, v31
; GCN-NEXT: v_mov_b32_e32 v36, v31
; GCN-NEXT: v_mov_b32_e32 v37, v31
; GCN-NEXT: v_mov_b32_e32 v38, v31
; GCN-NEXT: v_mov_b32_e32 v48, v31
; GCN-NEXT: v_mov_b32_e32 v49, v31
; GCN-NEXT: v_mov_b32_e32 v50, v31
; GCN-NEXT: v_mov_b32_e32 v51, v31
; GCN-NEXT: v_mov_b32_e32 v52, v31
; GCN-NEXT: v_mov_b32_e32 v53, v31
; GCN-NEXT: v_mov_b32_e32 v54, v31
; GCN-NEXT: v_mov_b32_e32 v55, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB119_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
; GCN-NEXT: .LBB119_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32bf16_to_v64i8:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB119_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB119_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32bf16_to_v64i8:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB119_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB119_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32bf16_to_v64i8:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB119_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB119_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take %if only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 512 bits of %value as 64 i8 lanes; no value conversion.
%cast = bitcast <32 x bfloat> %value to <64 x i8>
br label %end
end:
; All-zero when arriving from %entry, the cast bits when arriving from %if.
%phi = phi <64 x i8> [zeroinitializer, %entry], [%cast, %if]
store <64 x i8> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v64i8_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <64 x i8> %value) {
; GCN-LABEL: v_bitcast_v64i8_to_v32bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:28
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v27, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: v_mov_b32_e32 v26, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v63, 0
; GCN-NEXT: v_mov_b32_e32 v29, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: v_mov_b32_e32 v30, 0
; GCN-NEXT: v_mov_b32_e32 v28, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB120_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7
; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8
; GCN-NEXT: v_or_b32_e32 v31, v0, v7
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v15
; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v16
; GCN-NEXT: v_or_b32_e32 v0, v0, v7
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: v_and_b32_e32 v7, 0xff, v23
; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v24
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_or_b32_e32 v0, v7, v8
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_and_b32_e32 v8, 0xff, v11
; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v44
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_or_b32_e32 v0, v8, v11
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GCN-NEXT: v_and_b32_e32 v11, 0xff, v3
; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v4
; GCN-NEXT: v_and_b32_e32 v18, 0xff, v5
; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v6
; GCN-NEXT: v_and_b32_e32 v20, 0xff, v9
; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v10
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v22, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v26, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v27, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v28, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v29, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v30, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v10, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v32, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v49, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0
; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12
; GCN-NEXT: v_lshlrev_b32_e32 v54, 24, v43
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v42
; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v17
; GCN-NEXT: v_and_b32_e32 v51, 0xff, v62
; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v61
; GCN-NEXT: v_and_b32_e32 v55, 0xff, v60
; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v59
; GCN-NEXT: v_and_b32_e32 v40, 0xff, v58
; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57
; GCN-NEXT: v_and_b32_e32 v41, 0xff, v56
; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v47
; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46
; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v45
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v47, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v56, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v44, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v58, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v43, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v59, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v61, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v42, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v62, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v63, 0xff, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v24, 24, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v34, 0xff, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v3
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11
; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v22
; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v26
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v27
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v28
; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v30
; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v32
; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v49
; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v12
; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v13
; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14
; GCN-NEXT: v_or_b32_e32 v12, v51, v53
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v55
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v40
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v41
; GCN-NEXT: v_or_b32_e32 v45, v46, v45
; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47
; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56
; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v58
; GCN-NEXT: v_or_b32_e32 v58, v59, v60
; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v61
; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v62
; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v63
; GCN-NEXT: v_or_b32_e32 v0, v0, v33
; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v34
; GCN-NEXT: v_or_b32_e32 v50, v50, v3
; GCN-NEXT: v_or_b32_e32 v52, v52, v18
; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_or_b32_e32 v48, v48, v4
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or_b32_e32 v18, v3, v36
; GCN-NEXT: v_or_b32_e32 v40, v7, v37
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3
; GCN-NEXT: v_or_b32_e32 v41, v8, v11
; GCN-NEXT: v_or_b32_e32 v22, v6, v20
; GCN-NEXT: v_or_b32_e32 v20, v9, v35
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3
; GCN-NEXT: v_or_b32_e32 v53, v10, v29
; GCN-NEXT: v_or_b32_e32 v21, v21, v30
; GCN-NEXT: v_or_b32_e32 v19, v19, v32
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v3
; GCN-NEXT: v_or_b32_e32 v54, v54, v26
; GCN-NEXT: v_or_b32_e32 v35, v25, v27
; GCN-NEXT: v_or_b32_e32 v37, v15, v28
; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v12
; GCN-NEXT: v_or_b32_e32 v25, v16, v13
; GCN-NEXT: v_or_b32_e32 v36, v57, v14
; GCN-NEXT: v_or_b32_e32 v38, v38, v5
; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v45
; GCN-NEXT: v_or_b32_e32 v39, v39, v46
; GCN-NEXT: v_or_b32_e32 v63, v44, v47
; GCN-NEXT: v_or_b32_e32 v29, v43, v56
; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v58
; GCN-NEXT: v_or_b32_e32 v34, v42, v59
; GCN-NEXT: v_or_b32_e32 v30, v23, v60
; GCN-NEXT: v_or_b32_e32 v28, v24, v61
; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GCN-NEXT: v_or_b32_e32 v0, v17, v62
; GCN-NEXT: .LBB120_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v52
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v48
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v31
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v55
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v53
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v49
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v54
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v51
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v37
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v35
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v38
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v36
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v63
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v34
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
; GCN-NEXT: v_alignbit_b32 v11, v19, v20, 16
; GCN-NEXT: v_alignbit_b32 v12, v21, v22, 16
; GCN-NEXT: v_alignbit_b32 v13, v23, v24, 16
; GCN-NEXT: v_alignbit_b32 v14, v25, v26, 16
; GCN-NEXT: v_alignbit_b32 v15, v27, v29, 16
; GCN-NEXT: v_alignbit_b32 v16, v31, v32, 16
; GCN-NEXT: v_alignbit_b32 v17, v28, v30, 16
; GCN-NEXT: v_alignbit_b32 v18, v0, v33, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v64i8_to_v32bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:136
; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:128
; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124
; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:120
; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116
; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:104
; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:100
; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:96
; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:92
; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:88
; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:84
; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:80
; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:76
; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:72
; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:64
; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:56
; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:52
; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:48
; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:40
; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:36
; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:32
; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:28
; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:24
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32
; VI-NEXT: v_mov_b32_e32 v31, 0
; VI-NEXT: v_mov_b32_e32 v32, v31
; VI-NEXT: v_mov_b32_e32 v33, v31
; VI-NEXT: v_mov_b32_e32 v34, v31
; VI-NEXT: v_mov_b32_e32 v35, v31
; VI-NEXT: v_mov_b32_e32 v36, v31
; VI-NEXT: v_mov_b32_e32 v37, v31
; VI-NEXT: v_mov_b32_e32 v38, v31
; VI-NEXT: v_mov_b32_e32 v48, v31
; VI-NEXT: v_mov_b32_e32 v49, v31
; VI-NEXT: v_mov_b32_e32 v50, v31
; VI-NEXT: v_mov_b32_e32 v51, v31
; VI-NEXT: v_mov_b32_e32 v52, v31
; VI-NEXT: v_mov_b32_e32 v53, v31
; VI-NEXT: v_mov_b32_e32 v54, v31
; VI-NEXT: v_mov_b32_e32 v55, v31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB120_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v6
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v8
; VI-NEXT: v_lshlrev_b16_e32 v6, 8, v10
; VI-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v31, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v32, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v12
; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v33, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v34, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v35, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v36, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v37, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v13
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v15
; VI-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v38, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v17
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v19
; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v22, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v48, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v21
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v23
; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v26, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v49, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v25
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v27
; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v50, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v29
; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v63
; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v51, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v61
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v59
; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v52, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v47
; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v53, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v43
; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v54, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v41
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v39
; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v55, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: .LBB120_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[52:55]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[48:51]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34]
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v64i8_to_v32bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:136
; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:128
; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124
; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:120
; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116
; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:104
; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:100
; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:96
; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:92
; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:88
; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:84
; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:80
; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:76
; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:72
; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:64
; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:56
; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:52
; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:48
; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:40
; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:36
; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:32
; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:28
; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:24
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:16
; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32
; GFX9-NEXT: v_mov_b32_e32 v31, 0
; GFX9-NEXT: v_mov_b32_e32 v32, v31
; GFX9-NEXT: v_mov_b32_e32 v33, v31
; GFX9-NEXT: v_mov_b32_e32 v34, v31
; GFX9-NEXT: v_mov_b32_e32 v35, v31
; GFX9-NEXT: v_mov_b32_e32 v36, v31
; GFX9-NEXT: v_mov_b32_e32 v37, v31
; GFX9-NEXT: v_mov_b32_e32 v38, v31
; GFX9-NEXT: v_mov_b32_e32 v48, v31
; GFX9-NEXT: v_mov_b32_e32 v49, v31
; GFX9-NEXT: v_mov_b32_e32 v50, v31
; GFX9-NEXT: v_mov_b32_e32 v51, v31
; GFX9-NEXT: v_mov_b32_e32 v52, v31
; GFX9-NEXT: v_mov_b32_e32 v53, v31
; GFX9-NEXT: v_mov_b32_e32 v54, v31
; GFX9-NEXT: v_mov_b32_e32 v55, v31
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB120_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v6
; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v8
; GFX9-NEXT: v_lshlrev_b16_e32 v6, 8, v10
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v31, v4, v3, s6
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GFX9-NEXT: v_perm_b32 v32, v6, v5, s6
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v12
; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v33, v4, v3, s6
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v34, v4, v3, s6
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v35, v4, v3, s6
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v36, v4, v3, s6
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v37, v4, v3, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v13
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v15
; GFX9-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v38, v4, v3, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v17
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19
; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v4, v22, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v48, v4, v3, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v21
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v23
; GFX9-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v4, v26, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v49, v4, v3, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v25
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v27
; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v50, v4, v3, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v29
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v63
; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v51, v3, v0, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v61
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v59
; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v52, v3, v0, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v47
; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v53, v3, v0, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v43
; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v54, v3, v0, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v41
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v39
; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v55, v3, v0, s6
; GFX9-NEXT: .LBB120_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[52:55], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[48:51], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v64i8_to_v32bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:140
; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:136
; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:132
; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:128
; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:124
; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:120
; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:116
; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:112
; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:108
; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:104
; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:100
; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:96
; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:92
; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:88
; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:84
; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:80
; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:76
; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:72
; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:68
; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:64
; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:60
; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:56
; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:52
; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:48
; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:44
; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:40
; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:36
; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:32
; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:28
; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:24
; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:20
; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:12
; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:8
; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:4
; GFX11-NEXT: scratch_load_u16 v130, off, s32
; GFX11-NEXT: v_mov_b32_e32 v31, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v32, v31
; GFX11-NEXT: v_mov_b32_e32 v33, v31
; GFX11-NEXT: v_mov_b32_e32 v34, v31
; GFX11-NEXT: v_mov_b32_e32 v35, v31
; GFX11-NEXT: v_mov_b32_e32 v36, v31
; GFX11-NEXT: v_mov_b32_e32 v37, v31
; GFX11-NEXT: v_mov_b32_e32 v38, v31
; GFX11-NEXT: v_mov_b32_e32 v48, v31
; GFX11-NEXT: v_mov_b32_e32 v49, v31
; GFX11-NEXT: v_mov_b32_e32 v50, v31
; GFX11-NEXT: v_mov_b32_e32 v51, v31
; GFX11-NEXT: v_mov_b32_e32 v52, v31
; GFX11-NEXT: v_mov_b32_e32 v53, v31
; GFX11-NEXT: v_mov_b32_e32 v54, v31
; GFX11-NEXT: v_mov_b32_e32 v55, v31
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB120_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v9
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v14
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v24
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v8
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v10
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v13
; GFX11-NEXT: v_perm_b32 v31, v3, v0, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v11
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v12
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v15
; GFX11-NEXT: v_lshlrev_b16 v11, 8, v16
; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
; GFX11-NEXT: v_or_b32_e32 v5, v6, v7
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v8, v9
; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v17
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v18
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v19
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v20
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v21
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v22
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v23
; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v25
; GFX11-NEXT: v_lshlrev_b16 v16, 8, v26
; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
; GFX11-NEXT: v_perm_b32 v32, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v33, v3, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v34, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v35, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v36, v11, v10, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v27
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v28
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v29
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v30
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v130
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v129
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v128
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v119
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v118
; GFX11-NEXT: v_lshlrev_b16 v11, 8, v117
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v116
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v115
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v114
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v113
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v112
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v103
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v102
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v101
; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v100
; GFX11-NEXT: v_lshlrev_b16 v16, 8, v99
; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
; GFX11-NEXT: v_perm_b32 v37, v3, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v38, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v48, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v49, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v50, v11, v10, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v98
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v97
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v96
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v87
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v86
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v85
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v84
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v83
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v82
; GFX11-NEXT: v_lshlrev_b16 v11, 8, v81
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v80
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v71
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v70
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v69
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v68
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v67
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v66
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v65
; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v64
; GFX11-NEXT: v_lshlrev_b16 v16, 8, v39
; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
; GFX11-NEXT: v_perm_b32 v51, v3, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v52, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v53, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v54, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v55, v11, v10, 0x5040100
; GFX11-NEXT: .LBB120_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[52:55], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[48:51], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <64 x i8> %value to <32 x bfloat>
br label %end
end:
%phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <32 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v32i16_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <32 x i16> %value) {
; GCN-LABEL: v_bitcast_v32i16_to_v32bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_waitcnt expcnt(3)
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v29, 0
; GCN-NEXT: v_mov_b32_e32 v46, 0
; GCN-NEXT: v_mov_b32_e32 v58, 0
; GCN-NEXT: v_mov_b32_e32 v47, 0
; GCN-NEXT: v_mov_b32_e32 v59, 0
; GCN-NEXT: v_mov_b32_e32 v56, 0
; GCN-NEXT: v_mov_b32_e32 v60, 0
; GCN-NEXT: v_mov_b32_e32 v57, 0
; GCN-NEXT: v_mov_b32_e32 v61, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v45, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB121_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v11
; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v12
; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v13
; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v14
; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v15
; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v16
; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v17
; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v18
; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v19
; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v20
; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v21
; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v22
; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v23
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v24
; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v25
; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v26
; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v27
; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v28
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v30
; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v63
; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v62
; GCN-NEXT: .LBB121_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v46
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v59
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v60
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v56
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v61
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v57
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v42
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v54
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v43
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v55
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v44
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v45
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v50
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v38
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v51
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v52
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v48
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v53
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v49
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v35
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v31
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v36
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v32
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v37
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v34
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_alignbit_b32 v5, v6, v7, 16
; GCN-NEXT: v_alignbit_b32 v6, v8, v9, 16
; GCN-NEXT: v_alignbit_b32 v7, v10, v11, 16
; GCN-NEXT: v_alignbit_b32 v8, v12, v13, 16
; GCN-NEXT: v_alignbit_b32 v9, v14, v15, 16
; GCN-NEXT: v_alignbit_b32 v10, v16, v17, 16
; GCN-NEXT: v_alignbit_b32 v11, v18, v19, 16
; GCN-NEXT: v_alignbit_b32 v12, v20, v21, 16
; GCN-NEXT: v_alignbit_b32 v13, v22, v23, 16
; GCN-NEXT: v_alignbit_b32 v14, v24, v25, 16
; GCN-NEXT: v_alignbit_b32 v15, v26, v27, 16
; GCN-NEXT: v_alignbit_b32 v16, v28, v30, 16
; GCN-NEXT: v_alignbit_b32 v17, v31, v32, 16
; GCN-NEXT: v_alignbit_b32 v18, v29, v33, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32i16_to_v32bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB121_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB121_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32i16_to_v32bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB121_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB121_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32i16_to_v32bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB121_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB121_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <32 x i16> %value to <32 x bfloat>
br label %end
end:
%phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <32 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional <32 x half> -> <32 x bfloat> bitcast whose result feeds a phi.
; When %cond is 0 the %if block bitcasts %value; otherwise the phi in %end
; takes zeroinitializer. The merged vector is then stored to %out, a pointer
; in addrspace(1).
;
; The assertions inside the body are autogenerated (see the NOTE line at the
; top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand. They cover the four RUN configurations of
; this file: the run without -mcpu, tonga, gfx900 and gfx1100.
; NOTE(review): the RUN lines pass
; -amdgpu-codegenprepare-break-large-phis-threshold=4096, presumably so this
; wide vector phi is not split up before instruction selection - confirm
; against the AMDGPU codegenprepare pass.
define void @v_bitcast_v32f16_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <32 x half> %value) {
; GCN-LABEL: v_bitcast_v32f16_to_v32bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v46, 0
; GCN-NEXT: v_mov_b32_e32 v58, 0
; GCN-NEXT: v_mov_b32_e32 v47, 0
; GCN-NEXT: v_mov_b32_e32 v59, 0
; GCN-NEXT: v_mov_b32_e32 v56, 0
; GCN-NEXT: v_mov_b32_e32 v60, 0
; GCN-NEXT: v_mov_b32_e32 v57, 0
; GCN-NEXT: v_mov_b32_e32 v61, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v45, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB122_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8
; GCN-NEXT: v_cvt_f16_f32_e32 v8, v9
; GCN-NEXT: v_cvt_f16_f32_e32 v9, v10
; GCN-NEXT: v_cvt_f16_f32_e32 v10, v11
; GCN-NEXT: v_cvt_f16_f32_e32 v11, v12
; GCN-NEXT: v_cvt_f16_f32_e32 v12, v13
; GCN-NEXT: v_cvt_f16_f32_e32 v13, v14
; GCN-NEXT: v_cvt_f16_f32_e32 v14, v15
; GCN-NEXT: v_cvt_f16_f32_e32 v15, v16
; GCN-NEXT: v_cvt_f16_f32_e32 v16, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v18
; GCN-NEXT: v_cvt_f16_f32_e32 v18, v19
; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v21
; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22
; GCN-NEXT: v_cvt_f16_f32_e32 v22, v23
; GCN-NEXT: v_cvt_f16_f32_e32 v23, v24
; GCN-NEXT: v_cvt_f16_f32_e32 v24, v25
; GCN-NEXT: v_cvt_f16_f32_e32 v25, v26
; GCN-NEXT: v_cvt_f16_f32_e32 v26, v27
; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v33, v31
; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62
; GCN-NEXT: v_cvt_f16_f32_e32 v62, v63
; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v11
; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v12
; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v13
; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v14
; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15
; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v16
; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v17
; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v18
; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v19
; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v20
; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v21
; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v22
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v23
; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v24
; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v25
; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v26
; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v27
; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v28
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v33
; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v30
; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v34
; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v62
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v29
; GCN-NEXT: .LBB122_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v46
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v59
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v47
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v60
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v56
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v61
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v57
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v42
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v54
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v43
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v55
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v44
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v45
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v50
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v38
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v51
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v52
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v48
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v53
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v49
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v35
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v31
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v36
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v32
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v37
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v34
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
; GCN-NEXT: v_alignbit_b32 v11, v19, v20, 16
; GCN-NEXT: v_alignbit_b32 v12, v21, v22, 16
; GCN-NEXT: v_alignbit_b32 v13, v23, v24, 16
; GCN-NEXT: v_alignbit_b32 v14, v25, v26, 16
; GCN-NEXT: v_alignbit_b32 v15, v27, v28, 16
; GCN-NEXT: v_alignbit_b32 v16, v29, v30, 16
; GCN-NEXT: v_alignbit_b32 v17, v31, v32, 16
; GCN-NEXT: v_alignbit_b32 v18, v0, v33, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32f16_to_v32bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB122_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB122_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32f16_to_v32bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB122_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB122_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32f16_to_v32bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB122_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB122_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Only lanes where %cond is 0 take the %if path; all others go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The cast under test. In the checked output for the run without -mcpu this
; block contains v_cvt_f16_f32 plus a 16-bit left shift per element, while the
; tonga, gfx900 and gfx1100 expectations show only register-to-register moves.
%cast = bitcast <32 x half> %value to <32 x bfloat>
br label %end
end:
; Merge point: all-zero vector when %if was skipped, the cast result otherwise.
%phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
; Storing the phi keeps the whole 64-byte vector live; the expectations above
; show it written as four 16-byte stores at offsets 0, 16, 32 and 48.
store <32 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v16i32_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <16 x i32> %value) {
; GCN-LABEL: v_bitcast_v16i32_to_v32bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v30, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: v_mov_b32_e32 v28, 0
; GCN-NEXT: v_mov_b32_e32 v29, 0
; GCN-NEXT: v_mov_b32_e32 v26, 0
; GCN-NEXT: v_mov_b32_e32 v27, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB123_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14
; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12
; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11
; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10
; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10
; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9
; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8
; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5
; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4
; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3
; GCN-NEXT: .LBB123_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16
; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16
; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16
; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16
; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16
; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16
; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16
; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v16i32_to_v32bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB123_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB123_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v16i32_to_v32bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB123_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB123_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v16i32_to_v32bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB123_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB123_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <16 x i32> %value to <32 x bfloat>
br label %end
end:
%phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <32 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditionally bitcasts <16 x float> %value to <32 x bfloat> and stores the
; result to %out. When %cond is 0 the %if block performs the bitcast; otherwise
; control goes straight to %end, where a phi picks zeroinitializer instead.
;
; What the recorded expectations below show:
;  * The run line without -mcpu splits each 32-bit input register into its high
;    half (v_and_b32 with 0xffff0000) and its low half (v_lshlrev_b32 by 16),
;    multiplies every half by 1.0, and repacks pairs with v_alignbit_b32 before
;    four buffer_store_dwordx4. It also spills and reloads v40 and v41.
;  * The tonga, gfx900 and gfx1100 run lines zero v19..v34, copy v3..v18 over
;    them inside the %if block, and store the sixteen registers unchanged.
define void @v_bitcast_v16f32_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <16 x float> %value) {
; GCN-LABEL: v_bitcast_v16f32_to_v32bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v30, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: v_mov_b32_e32 v28, 0
; GCN-NEXT: v_mov_b32_e32 v29, 0
; GCN-NEXT: v_mov_b32_e32 v26, 0
; GCN-NEXT: v_mov_b32_e32 v27, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB124_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14
; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12
; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11
; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10
; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10
; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9
; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8
; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5
; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4
; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3
; GCN-NEXT: .LBB124_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16
; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16
; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16
; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16
; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16
; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16
; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16
; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v16f32_to_v32bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB124_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB124_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v16f32_to_v32bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB124_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB124_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v16f32_to_v32bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB124_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB124_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 16 x 32-bit lanes as 32 x 16-bit bfloat lanes (same 512 bits).
%cast = bitcast <16 x float> %value to <32 x bfloat>
br label %end
end:
; Merge point: all-zero vector when arriving from %entry, the bitcast result
; when arriving from %if.
%phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <32 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditionally bitcasts <8 x double> %value to <32 x bfloat> and stores the
; result to %out. When %cond is 0 the %if block performs the bitcast; otherwise
; control goes straight to %end, where a phi picks zeroinitializer instead.
;
; What the recorded expectations below show:
;  * The run line without -mcpu splits each 32-bit input register into its high
;    half (v_and_b32 with 0xffff0000) and its low half (v_lshlrev_b32 by 16),
;    multiplies every half by 1.0, and repacks pairs with v_alignbit_b32 before
;    four buffer_store_dwordx4. It also spills and reloads v40 and v41.
;  * The tonga, gfx900 and gfx1100 run lines zero v19..v34, copy v3..v18 over
;    them inside the %if block, and store the sixteen registers unchanged.
define void @v_bitcast_v8f64_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <8 x double> %value) {
; GCN-LABEL: v_bitcast_v8f64_to_v32bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v30, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: v_mov_b32_e32 v28, 0
; GCN-NEXT: v_mov_b32_e32 v29, 0
; GCN-NEXT: v_mov_b32_e32 v26, 0
; GCN-NEXT: v_mov_b32_e32 v27, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB125_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14
; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12
; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11
; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10
; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10
; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9
; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8
; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5
; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4
; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3
; GCN-NEXT: .LBB125_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16
; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16
; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16
; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16
; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16
; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16
; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16
; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8f64_to_v32bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB125_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB125_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8f64_to_v32bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB125_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB125_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8f64_to_v32bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB125_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB125_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; Reinterpret the 8 x 64-bit lanes as 32 x 16-bit bfloat lanes (same 512 bits).
%cast = bitcast <8 x double> %value to <32 x bfloat>
br label %end
end:
; Merge point: all-zero vector when arriving from %entry, the bitcast result
; when arriving from %if.
%phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <32 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Conditional bitcast of <8 x i64> to <32 x bfloat>: when %cond is zero the
; bitcast of %value is stored to %out, otherwise an all-zero vector is stored.
; In the checks below, the run without -mcpu splits every input dword into two
; 16-bit halves and repacks them with v_alignbit_b32 before storing, while the
; tonga, gfx900 and gfx1100 runs only copy the sixteen input dwords.
define void @v_bitcast_v8i64_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <8 x i64> %value) {
; GCN-LABEL: v_bitcast_v8i64_to_v32bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v30, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: v_mov_b32_e32 v28, 0
; GCN-NEXT: v_mov_b32_e32 v29, 0
; GCN-NEXT: v_mov_b32_e32 v26, 0
; GCN-NEXT: v_mov_b32_e32 v27, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB126_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14
; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12
; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11
; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10
; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10
; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9
; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8
; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5
; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4
; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3
; GCN-NEXT: .LBB126_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16
; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16
; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16
; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16
; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16
; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16
; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16
; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8i64_to_v32bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v19, 0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v20, v19
; VI-NEXT: v_mov_b32_e32 v21, v19
; VI-NEXT: v_mov_b32_e32 v22, v19
; VI-NEXT: v_mov_b32_e32 v23, v19
; VI-NEXT: v_mov_b32_e32 v24, v19
; VI-NEXT: v_mov_b32_e32 v25, v19
; VI-NEXT: v_mov_b32_e32 v26, v19
; VI-NEXT: v_mov_b32_e32 v27, v19
; VI-NEXT: v_mov_b32_e32 v28, v19
; VI-NEXT: v_mov_b32_e32 v29, v19
; VI-NEXT: v_mov_b32_e32 v30, v19
; VI-NEXT: v_mov_b32_e32 v31, v19
; VI-NEXT: v_mov_b32_e32 v32, v19
; VI-NEXT: v_mov_b32_e32 v33, v19
; VI-NEXT: v_mov_b32_e32 v34, v19
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB126_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v34, v18
; VI-NEXT: v_mov_b32_e32 v33, v17
; VI-NEXT: v_mov_b32_e32 v32, v16
; VI-NEXT: v_mov_b32_e32 v31, v15
; VI-NEXT: v_mov_b32_e32 v30, v14
; VI-NEXT: v_mov_b32_e32 v29, v13
; VI-NEXT: v_mov_b32_e32 v28, v12
; VI-NEXT: v_mov_b32_e32 v27, v11
; VI-NEXT: v_mov_b32_e32 v26, v10
; VI-NEXT: v_mov_b32_e32 v25, v9
; VI-NEXT: v_mov_b32_e32 v24, v8
; VI-NEXT: v_mov_b32_e32 v23, v7
; VI-NEXT: v_mov_b32_e32 v22, v6
; VI-NEXT: v_mov_b32_e32 v21, v5
; VI-NEXT: v_mov_b32_e32 v20, v4
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: .LBB126_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8i64_to_v32bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v19, 0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v20, v19
; GFX9-NEXT: v_mov_b32_e32 v21, v19
; GFX9-NEXT: v_mov_b32_e32 v22, v19
; GFX9-NEXT: v_mov_b32_e32 v23, v19
; GFX9-NEXT: v_mov_b32_e32 v24, v19
; GFX9-NEXT: v_mov_b32_e32 v25, v19
; GFX9-NEXT: v_mov_b32_e32 v26, v19
; GFX9-NEXT: v_mov_b32_e32 v27, v19
; GFX9-NEXT: v_mov_b32_e32 v28, v19
; GFX9-NEXT: v_mov_b32_e32 v29, v19
; GFX9-NEXT: v_mov_b32_e32 v30, v19
; GFX9-NEXT: v_mov_b32_e32 v31, v19
; GFX9-NEXT: v_mov_b32_e32 v32, v19
; GFX9-NEXT: v_mov_b32_e32 v33, v19
; GFX9-NEXT: v_mov_b32_e32 v34, v19
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB126_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v34, v18
; GFX9-NEXT: v_mov_b32_e32 v33, v17
; GFX9-NEXT: v_mov_b32_e32 v32, v16
; GFX9-NEXT: v_mov_b32_e32 v31, v15
; GFX9-NEXT: v_mov_b32_e32 v30, v14
; GFX9-NEXT: v_mov_b32_e32 v29, v13
; GFX9-NEXT: v_mov_b32_e32 v28, v12
; GFX9-NEXT: v_mov_b32_e32 v27, v11
; GFX9-NEXT: v_mov_b32_e32 v26, v10
; GFX9-NEXT: v_mov_b32_e32 v25, v9
; GFX9-NEXT: v_mov_b32_e32 v24, v8
; GFX9-NEXT: v_mov_b32_e32 v23, v7
; GFX9-NEXT: v_mov_b32_e32 v22, v6
; GFX9-NEXT: v_mov_b32_e32 v21, v5
; GFX9-NEXT: v_mov_b32_e32 v20, v4
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: .LBB126_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8i64_to_v32bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v19, 0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v20, v19
; GFX11-NEXT: v_mov_b32_e32 v21, v19
; GFX11-NEXT: v_mov_b32_e32 v22, v19
; GFX11-NEXT: v_mov_b32_e32 v23, v19
; GFX11-NEXT: v_mov_b32_e32 v24, v19
; GFX11-NEXT: v_mov_b32_e32 v25, v19
; GFX11-NEXT: v_mov_b32_e32 v26, v19
; GFX11-NEXT: v_mov_b32_e32 v27, v19
; GFX11-NEXT: v_mov_b32_e32 v28, v19
; GFX11-NEXT: v_mov_b32_e32 v29, v19
; GFX11-NEXT: v_mov_b32_e32 v30, v19
; GFX11-NEXT: v_mov_b32_e32 v31, v19
; GFX11-NEXT: v_mov_b32_e32 v32, v19
; GFX11-NEXT: v_mov_b32_e32 v33, v19
; GFX11-NEXT: v_mov_b32_e32 v34, v19
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB126_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
; GFX11-NEXT: .LBB126_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
; Take the %if path only when %cond is zero; otherwise go straight to %end.
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
; The bitcast under test: 512 bits reinterpreted as 32 bfloat elements.
%cast = bitcast <8 x i64> %value to <32 x bfloat>
br label %end
end:
; %phi is all zeros when arriving from %entry and the bitcast result when
; arriving from %if; it is then stored to the addrspace(1) pointer %out.
%phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <32 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
; Bitcast of <8 x i64> to <32 x half> on both sides of a branch on %b: when %b
; is zero, 3 is added to every i64 element before the bitcast; otherwise %a is
; bitcast unchanged. The selected vector is returned rather than stored.
; In the checks below, the run without -mcpu converts each 16-bit half with
; v_cvt_f32_f16 and writes the results to v0-v31; the tonga, gfx900 and
; gfx1100 runs only contain code for %cmp.true (eight 64-bit add-with-carry
; pairs) and no instructions for %cmp.false.
define <32 x half> @v_bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) {
; GCN-LABEL: v_bitcast_v8i64_to_v32f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v34, v15
; GCN-NEXT: v_mov_b32_e32 v33, v14
; GCN-NEXT: v_mov_b32_e32 v36, v13
; GCN-NEXT: v_mov_b32_e32 v35, v12
; GCN-NEXT: v_mov_b32_e32 v38, v11
; GCN-NEXT: v_mov_b32_e32 v37, v10
; GCN-NEXT: v_mov_b32_e32 v48, v9
; GCN-NEXT: v_mov_b32_e32 v39, v8
; GCN-NEXT: v_mov_b32_e32 v50, v7
; GCN-NEXT: v_mov_b32_e32 v49, v6
; GCN-NEXT: v_mov_b32_e32 v52, v5
; GCN-NEXT: v_mov_b32_e32 v51, v4
; GCN-NEXT: v_mov_b32_e32 v54, v3
; GCN-NEXT: v_mov_b32_e32 v53, v2
; GCN-NEXT: v_mov_b32_e32 v55, v1
; GCN-NEXT: v_mov_b32_e32 v32, v0
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: ; implicit-def: $vgpr2
; GCN-NEXT: ; implicit-def: $vgpr3
; GCN-NEXT: ; implicit-def: $vgpr4
; GCN-NEXT: ; implicit-def: $vgpr5
; GCN-NEXT: ; implicit-def: $vgpr6
; GCN-NEXT: ; implicit-def: $vgpr7
; GCN-NEXT: ; implicit-def: $vgpr8
; GCN-NEXT: ; implicit-def: $vgpr9
; GCN-NEXT: ; implicit-def: $vgpr10
; GCN-NEXT: ; implicit-def: $vgpr11
; GCN-NEXT: ; implicit-def: $vgpr12
; GCN-NEXT: ; implicit-def: $vgpr13
; GCN-NEXT: ; implicit-def: $vgpr14
; GCN-NEXT: ; implicit-def: $vgpr15
; GCN-NEXT: ; implicit-def: $vgpr16
; GCN-NEXT: ; implicit-def: $vgpr17
; GCN-NEXT: ; implicit-def: $vgpr18
; GCN-NEXT: ; implicit-def: $vgpr19
; GCN-NEXT: ; implicit-def: $vgpr20
; GCN-NEXT: ; implicit-def: $vgpr21
; GCN-NEXT: ; implicit-def: $vgpr22
; GCN-NEXT: ; implicit-def: $vgpr23
; GCN-NEXT: ; implicit-def: $vgpr24
; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: ; implicit-def: $vgpr26
; GCN-NEXT: ; implicit-def: $vgpr27
; GCN-NEXT: ; implicit-def: $vgpr28
; GCN-NEXT: ; implicit-def: $vgpr29
; GCN-NEXT: ; implicit-def: $vgpr30
; GCN-NEXT: ; implicit-def: $vgpr31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB127_2
; GCN-NEXT: ; %bb.1: ; %cmp.false
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v34
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v36
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v35
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v38
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v37
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v48
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v39
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v50
; GCN-NEXT: s_waitcnt expcnt(6)
; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v49
; GCN-NEXT: s_waitcnt expcnt(5)
; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v52
; GCN-NEXT: s_waitcnt expcnt(4)
; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v51
; GCN-NEXT: s_waitcnt expcnt(3)
; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v54
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v53
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32
; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34
; GCN-NEXT: v_cvt_f32_f16_e32 v28, v33
; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36
; GCN-NEXT: v_cvt_f32_f16_e32 v24, v35
; GCN-NEXT: v_cvt_f32_f16_e32 v22, v38
; GCN-NEXT: v_cvt_f32_f16_e32 v20, v37
; GCN-NEXT: v_cvt_f32_f16_e32 v18, v48
; GCN-NEXT: v_cvt_f32_f16_e32 v16, v39
; GCN-NEXT: v_cvt_f32_f16_e32 v14, v50
; GCN-NEXT: v_cvt_f32_f16_e32 v12, v49
; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52
; GCN-NEXT: v_cvt_f32_f16_e32 v8, v51
; GCN-NEXT: v_cvt_f32_f16_e32 v6, v54
; GCN-NEXT: v_cvt_f32_f16_e32 v4, v53
; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55
; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1
; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5
; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7
; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9
; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11
; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13
; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40
; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41
; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42
; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43
; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45
; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32
; GCN-NEXT: ; implicit-def: $vgpr32
; GCN-NEXT: ; implicit-def: $vgpr55
; GCN-NEXT: ; implicit-def: $vgpr53
; GCN-NEXT: ; implicit-def: $vgpr54
; GCN-NEXT: ; implicit-def: $vgpr51
; GCN-NEXT: ; implicit-def: $vgpr52
; GCN-NEXT: ; implicit-def: $vgpr49
; GCN-NEXT: ; implicit-def: $vgpr50
; GCN-NEXT: ; implicit-def: $vgpr39
; GCN-NEXT: ; implicit-def: $vgpr48
; GCN-NEXT: ; implicit-def: $vgpr37
; GCN-NEXT: ; implicit-def: $vgpr38
; GCN-NEXT: ; implicit-def: $vgpr35
; GCN-NEXT: ; implicit-def: $vgpr36
; GCN-NEXT: ; implicit-def: $vgpr33
; GCN-NEXT: ; implicit-def: $vgpr34
; GCN-NEXT: .LBB127_2: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB127_4
; GCN-NEXT: ; %bb.3: ; %cmp.true
; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc
; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v54, vcc
; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v52, vcc
; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v50, vcc
; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v48, vcc
; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37
; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v38, vcc
; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35
; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v36, vcc
; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33
; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v34, vcc
; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15
; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15
; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14
; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13
; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12
; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11
; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10
; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9
; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8
; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7
; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6
; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5
; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4
; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2
; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39
; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38
; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37
; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36
; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35
; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33
; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32
; GCN-NEXT: .LBB127_4: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8i64_to_v32f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB127_2
; VI-NEXT: ; %bb.1: ; %cmp.true
; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: .LBB127_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8i64_to_v32f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB127_2
; GFX9-NEXT: ; %bb.1: ; %cmp.true
; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14
; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10
; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: .LBB127_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8i64_to_v32f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB127_2
; GFX11-NEXT: ; %bb.1: ; %cmp.true
; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: .LBB127_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; Select the path: %cmp.true when %b is zero, %cmp.false otherwise.
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
cmp.true:
; Add 3 to every i64 element first, so the bitcast operand on this path is a
; computed value rather than the unmodified argument.
%a1 = add <8 x i64> %a, splat (i64 3)
%a2 = bitcast <8 x i64> %a1 to <32 x half>
br label %end
cmp.false:
; Bitcast the incoming argument directly.
%a3 = bitcast <8 x i64> %a to <32 x half>
br label %end
end:
; Merge the two bitcast results and return the selected vector.
%phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <32 x half> %phi
}
define <32 x i16> @v_bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) {
; GCN-LABEL: v_bitcast_v8i64_to_v32i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v30, v15
; GCN-NEXT: v_mov_b32_e32 v28, v14
; GCN-NEXT: v_mov_b32_e32 v26, v13
; GCN-NEXT: v_mov_b32_e32 v24, v12
; GCN-NEXT: v_mov_b32_e32 v22, v11
; GCN-NEXT: v_mov_b32_e32 v20, v10
; GCN-NEXT: v_mov_b32_e32 v18, v9
; GCN-NEXT: v_mov_b32_e32 v32, v8
; GCN-NEXT: v_mov_b32_e32 v14, v7
; GCN-NEXT: v_mov_b32_e32 v12, v6
; GCN-NEXT: v_mov_b32_e32 v10, v5
; GCN-NEXT: v_mov_b32_e32 v8, v4
; GCN-NEXT: v_mov_b32_e32 v6, v3
; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: ; implicit-def: $vgpr3
; GCN-NEXT: ; implicit-def: $vgpr5
; GCN-NEXT: ; implicit-def: $vgpr7
; GCN-NEXT: ; implicit-def: $vgpr9
; GCN-NEXT: ; implicit-def: $vgpr11
; GCN-NEXT: ; implicit-def: $vgpr13
; GCN-NEXT: ; implicit-def: $vgpr15
; GCN-NEXT: ; implicit-def: $vgpr17
; GCN-NEXT: ; implicit-def: $vgpr19
; GCN-NEXT: ; implicit-def: $vgpr21
; GCN-NEXT: ; implicit-def: $vgpr23
; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: ; implicit-def: $vgpr27
; GCN-NEXT: ; implicit-def: $vgpr29
; GCN-NEXT: ; implicit-def: $vgpr31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB128_2
; GCN-NEXT: ; %bb.1: ; %cmp.false
; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16
; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16
; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16
; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16
; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16
; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16
; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16
; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GCN-NEXT: .LBB128_2: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB128_4
; GCN-NEXT: ; %bb.3: ; %cmp.true
; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc
; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32
; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc
; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20
; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24
; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc
; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16
; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16
; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16
; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16
; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16
; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16
; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16
; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GCN-NEXT: .LBB128_4: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v16, v32
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8i64_to_v32i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB128_2
; VI-NEXT: ; %bb.1: ; %cmp.true
; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: .LBB128_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8i64_to_v32i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB128_2
; GFX9-NEXT: ; %bb.1: ; %cmp.true
; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14
; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10
; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: .LBB128_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8i64_to_v32i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB128_2
; GFX11-NEXT: ; %bb.1: ; %cmp.true
; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: .LBB128_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
cmp.true:
%a1 = add <8 x i64> %a, splat (i64 3)
%a2 = bitcast <8 x i64> %a1 to <32 x i16>
br label %end
cmp.false:
%a3 = bitcast <8 x i64> %a to <32 x i16>
br label %end
end:
%phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <32 x i16> %phi
}
; Bitcast of <8 x double> to <32 x i16> where the result reaches the return
; through a phi. When %b is zero, 1.0 is added to every element before the
; cast (%cmp.true); otherwise %a is cast unchanged (%cmp.false).
; In the expected output below, the run without -mcpu repacks the data in both
; branches using v_alignbit_b32 and v_lshrrev_b32 with a shift of 16, whereas
; the tonga, gfx900 and gfx1100 runs contain only the eight v_add_f64
; instructions of the %cmp.true block.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <32 x i16> @v_bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) {
; GCN-LABEL: v_bitcast_v8f64_to_v32i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v55, v15
; GCN-NEXT: v_mov_b32_e32 v54, v14
; GCN-NEXT: v_mov_b32_e32 v53, v13
; GCN-NEXT: v_mov_b32_e32 v52, v12
; GCN-NEXT: v_mov_b32_e32 v51, v11
; GCN-NEXT: v_mov_b32_e32 v50, v10
; GCN-NEXT: v_mov_b32_e32 v49, v9
; GCN-NEXT: v_mov_b32_e32 v48, v8
; GCN-NEXT: v_mov_b32_e32 v38, v7
; GCN-NEXT: v_mov_b32_e32 v37, v6
; GCN-NEXT: v_mov_b32_e32 v36, v5
; GCN-NEXT: v_mov_b32_e32 v35, v4
; GCN-NEXT: v_mov_b32_e32 v34, v3
; GCN-NEXT: v_mov_b32_e32 v33, v2
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GCN-NEXT: ; implicit-def: $vgpr32
; GCN-NEXT: ; implicit-def: $vgpr3
; GCN-NEXT: ; implicit-def: $vgpr5
; GCN-NEXT: ; implicit-def: $vgpr7
; GCN-NEXT: ; implicit-def: $vgpr9
; GCN-NEXT: ; implicit-def: $vgpr11
; GCN-NEXT: ; implicit-def: $vgpr13
; GCN-NEXT: ; implicit-def: $vgpr15
; GCN-NEXT: ; implicit-def: $vgpr17
; GCN-NEXT: ; implicit-def: $vgpr19
; GCN-NEXT: ; implicit-def: $vgpr21
; GCN-NEXT: ; implicit-def: $vgpr23
; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: ; implicit-def: $vgpr27
; GCN-NEXT: ; implicit-def: $vgpr29
; GCN-NEXT: ; implicit-def: $vgpr31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB129_2
; GCN-NEXT: ; %bb.1: ; %cmp.false
; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16
; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16
; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16
; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16
; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16
; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16
; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16
; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GCN-NEXT: .LBB129_2: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB129_4
; GCN-NEXT: ; %bb.3: ; %cmp.true
; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0
; GCN-NEXT: v_add_f64 v[35:36], v[35:36], 1.0
; GCN-NEXT: v_add_f64 v[37:38], v[37:38], 1.0
; GCN-NEXT: v_add_f64 v[48:49], v[48:49], 1.0
; GCN-NEXT: v_add_f64 v[50:51], v[50:51], 1.0
; GCN-NEXT: v_add_f64 v[52:53], v[52:53], 1.0
; GCN-NEXT: v_add_f64 v[54:55], v[54:55], 1.0
; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16
; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16
; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16
; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16
; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16
; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16
; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16
; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GCN-NEXT: .LBB129_4: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-NEXT: v_mov_b32_e32 v4, v33
; GCN-NEXT: v_mov_b32_e32 v6, v34
; GCN-NEXT: v_mov_b32_e32 v8, v35
; GCN-NEXT: v_mov_b32_e32 v10, v36
; GCN-NEXT: v_mov_b32_e32 v12, v37
; GCN-NEXT: v_mov_b32_e32 v14, v38
; GCN-NEXT: v_mov_b32_e32 v16, v48
; GCN-NEXT: v_mov_b32_e32 v18, v49
; GCN-NEXT: v_mov_b32_e32 v20, v50
; GCN-NEXT: v_mov_b32_e32 v22, v51
; GCN-NEXT: v_mov_b32_e32 v24, v52
; GCN-NEXT: v_mov_b32_e32 v26, v53
; GCN-NEXT: v_mov_b32_e32 v28, v54
; GCN-NEXT: v_mov_b32_e32 v30, v55
; GCN-NEXT: v_mov_b32_e32 v1, v32
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8f64_to_v32i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB129_2
; VI-NEXT: ; %bb.1: ; %cmp.true
; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT: .LBB129_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8f64_to_v32i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB129_2
; GFX9-NEXT: ; %bb.1: ; %cmp.true
; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX9-NEXT: .LBB129_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8f64_to_v32i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB129_2
; GFX11-NEXT: ; %bb.1: ; %cmp.true
; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: .LBB129_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; A zero %b selects the branch that modifies %a before the cast.
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
cmp.true:
; Add 1.0 to every element, then reinterpret the 512 bits as 32 x i16.
%a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
%a2 = bitcast <8 x double> %a1 to <32 x i16>
br label %end
cmp.false:
; Reinterpret the unmodified input.
%a3 = bitcast <8 x double> %a to <32 x i16>
br label %end
end:
; Merge the two casts so the returned value depends on the branch taken.
%phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <32 x i16> %phi
}
; Bitcast of <8 x double> to <32 x half> where the result reaches the return
; through a phi. When %b is zero, 1.0 is added to every element before the
; cast (%cmp.true); otherwise %a is cast unchanged (%cmp.false).
; In the expected output below, the run without -mcpu splits each 32-bit
; register with v_lshrrev_b32 by 16 and passes both 16-bit pieces through
; v_cvt_f32_f16, spilling and reloading v40-v42 around the body. The tonga,
; gfx900 and gfx1100 runs contain only the eight v_add_f64 instructions of the
; %cmp.true block.
; NOTE(review): the v_cvt_f32_f16 conversions presumably mean half values are
; returned as 32-bit floats on the default subtarget - confirm against the
; calling-convention lowering.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <32 x half> @v_bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) {
; GCN-LABEL: v_bitcast_v8f64_to_v32f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GCN-NEXT: ; implicit-def: $vgpr39
; GCN-NEXT: ; implicit-def: $vgpr55
; GCN-NEXT: ; implicit-def: $vgpr32
; GCN-NEXT: ; implicit-def: $vgpr54
; GCN-NEXT: ; implicit-def: $vgpr33
; GCN-NEXT: ; implicit-def: $vgpr53
; GCN-NEXT: ; implicit-def: $vgpr34
; GCN-NEXT: ; implicit-def: $vgpr52
; GCN-NEXT: ; implicit-def: $vgpr35
; GCN-NEXT: ; implicit-def: $vgpr51
; GCN-NEXT: ; implicit-def: $vgpr36
; GCN-NEXT: ; implicit-def: $vgpr50
; GCN-NEXT: ; implicit-def: $vgpr37
; GCN-NEXT: ; implicit-def: $vgpr49
; GCN-NEXT: ; implicit-def: $vgpr38
; GCN-NEXT: ; implicit-def: $vgpr48
; GCN-NEXT: ; implicit-def: $vgpr16
; GCN-NEXT: ; implicit-def: $vgpr17
; GCN-NEXT: ; implicit-def: $vgpr18
; GCN-NEXT: ; implicit-def: $vgpr19
; GCN-NEXT: ; implicit-def: $vgpr20
; GCN-NEXT: ; implicit-def: $vgpr21
; GCN-NEXT: ; implicit-def: $vgpr22
; GCN-NEXT: ; implicit-def: $vgpr23
; GCN-NEXT: ; implicit-def: $vgpr24
; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: ; implicit-def: $vgpr26
; GCN-NEXT: ; implicit-def: $vgpr27
; GCN-NEXT: ; implicit-def: $vgpr28
; GCN-NEXT: ; implicit-def: $vgpr29
; GCN-NEXT: ; implicit-def: $vgpr30
; GCN-NEXT: ; implicit-def: $vgpr31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB130_2
; GCN-NEXT: ; %bb.1: ; %cmp.false
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15
; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14
; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13
; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12
; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11
; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10
; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9
; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8
; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7
; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6
; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5
; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4
; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2
; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1
; GCN-NEXT: v_cvt_f32_f16_e32 v31, v17
; GCN-NEXT: v_cvt_f32_f16_e32 v29, v19
; GCN-NEXT: v_cvt_f32_f16_e32 v27, v21
; GCN-NEXT: v_cvt_f32_f16_e32 v25, v23
; GCN-NEXT: v_cvt_f32_f16_e32 v23, v39
; GCN-NEXT: v_cvt_f32_f16_e32 v21, v48
; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49
; GCN-NEXT: v_cvt_f32_f16_e32 v17, v50
; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51
; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52
; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53
; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54
; GCN-NEXT: v_cvt_f32_f16_e32 v52, v55
; GCN-NEXT: v_cvt_f32_f16_e32 v53, v40
; GCN-NEXT: v_cvt_f32_f16_e32 v54, v41
; GCN-NEXT: v_cvt_f32_f16_e32 v55, v42
; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: ; implicit-def: $vgpr2
; GCN-NEXT: ; implicit-def: $vgpr4
; GCN-NEXT: ; implicit-def: $vgpr6
; GCN-NEXT: ; implicit-def: $vgpr8
; GCN-NEXT: ; implicit-def: $vgpr10
; GCN-NEXT: ; implicit-def: $vgpr12
; GCN-NEXT: ; implicit-def: $vgpr14
; GCN-NEXT: .LBB130_2: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB130_4
; GCN-NEXT: ; %bb.3: ; %cmp.true
; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15
; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15
; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14
; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13
; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12
; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11
; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10
; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9
; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8
; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7
; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6
; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5
; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4
; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2
; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1
; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48
; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49
; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50
; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51
; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52
; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53
; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54
; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55
; GCN-NEXT: .LBB130_4: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v39
; GCN-NEXT: v_mov_b32_e32 v1, v55
; GCN-NEXT: v_mov_b32_e32 v2, v32
; GCN-NEXT: v_mov_b32_e32 v3, v54
; GCN-NEXT: v_mov_b32_e32 v4, v33
; GCN-NEXT: v_mov_b32_e32 v5, v53
; GCN-NEXT: v_mov_b32_e32 v6, v34
; GCN-NEXT: v_mov_b32_e32 v7, v52
; GCN-NEXT: v_mov_b32_e32 v8, v35
; GCN-NEXT: v_mov_b32_e32 v9, v51
; GCN-NEXT: v_mov_b32_e32 v10, v36
; GCN-NEXT: v_mov_b32_e32 v11, v50
; GCN-NEXT: v_mov_b32_e32 v12, v37
; GCN-NEXT: v_mov_b32_e32 v13, v49
; GCN-NEXT: v_mov_b32_e32 v14, v38
; GCN-NEXT: v_mov_b32_e32 v15, v48
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v8f64_to_v32f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB130_2
; VI-NEXT: ; %bb.1: ; %cmp.true
; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT: .LBB130_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v8f64_to_v32f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB130_2
; GFX9-NEXT: ; %bb.1: ; %cmp.true
; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX9-NEXT: .LBB130_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v8f64_to_v32f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB130_2
; GFX11-NEXT: ; %bb.1: ; %cmp.true
; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: .LBB130_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; A zero %b selects the branch that modifies %a before the cast.
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
cmp.true:
; Add 1.0 to every element, then reinterpret the 512 bits as 32 x half.
%a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
%a2 = bitcast <8 x double> %a1 to <32 x half>
br label %end
cmp.false:
; Reinterpret the unmodified input.
%a3 = bitcast <8 x double> %a to <32 x half>
br label %end
end:
; Merge the two casts so the returned value depends on the branch taken.
%phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <32 x half> %phi
}
define <8 x i64> @v_bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) {
; GCN-LABEL: v_bitcast_v32f16_to_v8i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5
; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7
; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6
; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9
; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8
; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11
; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10
; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13
; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12
; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15
; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14
; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16
; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19
; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18
; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21
; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20
; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23
; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22
; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24
; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27
; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26
; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46
; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB131_2
; GCN-NEXT: ; %bb.1: ; %cmp.false
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43
; GCN-NEXT: v_or_b32_e32 v0, v44, v0
; GCN-NEXT: v_or_b32_e32 v1, v42, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20
; GCN-NEXT: v_or_b32_e32 v2, v52, v2
; GCN-NEXT: v_or_b32_e32 v3, v50, v3
; GCN-NEXT: v_or_b32_e32 v4, v48, v4
; GCN-NEXT: v_or_b32_e32 v5, v38, v5
; GCN-NEXT: v_or_b32_e32 v6, v36, v6
; GCN-NEXT: v_or_b32_e32 v7, v34, v7
; GCN-NEXT: v_or_b32_e32 v8, v33, v8
; GCN-NEXT: v_or_b32_e32 v9, v32, v9
; GCN-NEXT: v_or_b32_e32 v10, v31, v10
; GCN-NEXT: v_or_b32_e32 v11, v21, v11
; GCN-NEXT: v_or_b32_e32 v12, v19, v12
; GCN-NEXT: v_or_b32_e32 v13, v18, v13
; GCN-NEXT: v_or_b32_e32 v14, v17, v14
; GCN-NEXT: v_or_b32_e32 v15, v16, v15
; GCN-NEXT: ; implicit-def: $vgpr45
; GCN-NEXT: ; implicit-def: $vgpr44
; GCN-NEXT: ; implicit-def: $vgpr43
; GCN-NEXT: ; implicit-def: $vgpr42
; GCN-NEXT: ; implicit-def: $vgpr41
; GCN-NEXT: ; implicit-def: $vgpr52
; GCN-NEXT: ; implicit-def: $vgpr40
; GCN-NEXT: ; implicit-def: $vgpr50
; GCN-NEXT: ; implicit-def: $vgpr55
; GCN-NEXT: ; implicit-def: $vgpr48
; GCN-NEXT: ; implicit-def: $vgpr54
; GCN-NEXT: ; implicit-def: $vgpr38
; GCN-NEXT: ; implicit-def: $vgpr53
; GCN-NEXT: ; implicit-def: $vgpr36
; GCN-NEXT: ; implicit-def: $vgpr51
; GCN-NEXT: ; implicit-def: $vgpr34
; GCN-NEXT: ; implicit-def: $vgpr49
; GCN-NEXT: ; implicit-def: $vgpr33
; GCN-NEXT: ; implicit-def: $vgpr39
; GCN-NEXT: ; implicit-def: $vgpr32
; GCN-NEXT: ; implicit-def: $vgpr37
; GCN-NEXT: ; implicit-def: $vgpr31
; GCN-NEXT: ; implicit-def: $vgpr35
; GCN-NEXT: ; implicit-def: $vgpr21
; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: ; implicit-def: $vgpr19
; GCN-NEXT: ; implicit-def: $vgpr23
; GCN-NEXT: ; implicit-def: $vgpr18
; GCN-NEXT: ; implicit-def: $vgpr22
; GCN-NEXT: ; implicit-def: $vgpr17
; GCN-NEXT: ; implicit-def: $vgpr20
; GCN-NEXT: ; implicit-def: $vgpr16
; GCN-NEXT: .LBB131_2: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB131_4
; GCN-NEXT: ; %bb.3: ; %cmp.true
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45
; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44
; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42
; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_or_b32_e32 v1, v3, v2
; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52
; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40
; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50
; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55
; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48
; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54
; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38
; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53
; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36
; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51
; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34
; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49
; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33
; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39
; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32
; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37
; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31
; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35
; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28
; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29
; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19
; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22
; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
; GCN-NEXT: v_or_b32_e32 v3, v5, v4
; GCN-NEXT: v_or_b32_e32 v4, v7, v6
; GCN-NEXT: v_or_b32_e32 v5, v9, v8
; GCN-NEXT: v_or_b32_e32 v6, v11, v10
; GCN-NEXT: v_or_b32_e32 v7, v13, v12
; GCN-NEXT: v_or_b32_e32 v8, v15, v14
; GCN-NEXT: v_or_b32_e32 v9, v26, v24
; GCN-NEXT: v_or_b32_e32 v10, v28, v27
; GCN-NEXT: v_or_b32_e32 v11, v21, v29
; GCN-NEXT: v_or_b32_e32 v12, v19, v25
; GCN-NEXT: v_or_b32_e32 v13, v18, v23
; GCN-NEXT: v_or_b32_e32 v14, v17, v22
; GCN-NEXT: v_or_b32_e32 v15, v16, v20
; GCN-NEXT: .LBB131_4: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32f16_to_v8i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB131_2
; VI-NEXT: ; %bb.1: ; %cmp.true
; VI-NEXT: v_mov_b32_e32 v16, 0x200
; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v15, 0x200, v15
; VI-NEXT: v_or_b32_e32 v15, v15, v17
; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v14, 0x200, v14
; VI-NEXT: v_or_b32_e32 v14, v14, v17
; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v13, 0x200, v13
; VI-NEXT: v_or_b32_e32 v13, v13, v17
; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v12, 0x200, v12
; VI-NEXT: v_or_b32_e32 v12, v12, v17
; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v11, 0x200, v11
; VI-NEXT: v_or_b32_e32 v11, v11, v17
; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v10, 0x200, v10
; VI-NEXT: v_or_b32_e32 v10, v10, v17
; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
; VI-NEXT: v_or_b32_e32 v9, v9, v17
; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v8, 0x200, v8
; VI-NEXT: v_or_b32_e32 v8, v8, v17
; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
; VI-NEXT: v_or_b32_e32 v7, v7, v17
; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
; VI-NEXT: v_or_b32_e32 v6, v6, v17
; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v5, 0x200, v5
; VI-NEXT: v_or_b32_e32 v5, v5, v17
; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v4, 0x200, v4
; VI-NEXT: v_or_b32_e32 v4, v4, v17
; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
; VI-NEXT: v_or_b32_e32 v3, v3, v17
; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
; VI-NEXT: v_or_b32_e32 v2, v2, v17
; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, 0x200, v0
; VI-NEXT: v_or_b32_e32 v1, v1, v17
; VI-NEXT: v_or_b32_e32 v0, v0, v16
; VI-NEXT: .LBB131_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32f16_to_v8i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB131_2
; GFX9-NEXT: ; %bb.1: ; %cmp.true
; GFX9-NEXT: s_movk_i32 s6, 0x200
; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
; GFX9-NEXT: .LBB131_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32f16_to_v8i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB131_2
; GFX11-NEXT: ; %bb.1: ; %cmp.true
; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
; GFX11-NEXT: .LBB131_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
cmp.true:
%a1 = fadd <32 x half> %a, splat (half 0xH0200)
%a2 = bitcast <32 x half> %a1 to <8 x i64>
br label %end
cmp.false:
%a3 = bitcast <32 x half> %a to <8 x i64>
br label %end
end:
%phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <8 x i64> %phi
}
; Bitcast of <32 x half> to <8 x double> where the two arms of a branch on %b
; produce the value differently: cmp.true first adds a splat half constant to
; %a and bitcasts the sum, cmp.false bitcasts %a unchanged. Both <8 x double>
; results are merged by a phi in %end and returned.
; The expected-assembly lines inside this function are generated by
; utils/update_llc_test_checks.py (see the first line of this file); rerun
; that script rather than editing them by hand.
define <8 x double> @v_bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) {
; GCN-LABEL: v_bitcast_v32f16_to_v8f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5
; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7
; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6
; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9
; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8
; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11
; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10
; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13
; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12
; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15
; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14
; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16
; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19
; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18
; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21
; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20
; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23
; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22
; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24
; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27
; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26
; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46
; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB132_2
; GCN-NEXT: ; %bb.1: ; %cmp.false
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43
; GCN-NEXT: v_or_b32_e32 v0, v44, v0
; GCN-NEXT: v_or_b32_e32 v1, v42, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20
; GCN-NEXT: v_or_b32_e32 v2, v52, v2
; GCN-NEXT: v_or_b32_e32 v3, v50, v3
; GCN-NEXT: v_or_b32_e32 v4, v48, v4
; GCN-NEXT: v_or_b32_e32 v5, v38, v5
; GCN-NEXT: v_or_b32_e32 v6, v36, v6
; GCN-NEXT: v_or_b32_e32 v7, v34, v7
; GCN-NEXT: v_or_b32_e32 v8, v33, v8
; GCN-NEXT: v_or_b32_e32 v9, v32, v9
; GCN-NEXT: v_or_b32_e32 v10, v31, v10
; GCN-NEXT: v_or_b32_e32 v11, v21, v11
; GCN-NEXT: v_or_b32_e32 v12, v19, v12
; GCN-NEXT: v_or_b32_e32 v13, v18, v13
; GCN-NEXT: v_or_b32_e32 v14, v17, v14
; GCN-NEXT: v_or_b32_e32 v15, v16, v15
; GCN-NEXT: ; implicit-def: $vgpr45
; GCN-NEXT: ; implicit-def: $vgpr44
; GCN-NEXT: ; implicit-def: $vgpr43
; GCN-NEXT: ; implicit-def: $vgpr42
; GCN-NEXT: ; implicit-def: $vgpr41
; GCN-NEXT: ; implicit-def: $vgpr52
; GCN-NEXT: ; implicit-def: $vgpr40
; GCN-NEXT: ; implicit-def: $vgpr50
; GCN-NEXT: ; implicit-def: $vgpr55
; GCN-NEXT: ; implicit-def: $vgpr48
; GCN-NEXT: ; implicit-def: $vgpr54
; GCN-NEXT: ; implicit-def: $vgpr38
; GCN-NEXT: ; implicit-def: $vgpr53
; GCN-NEXT: ; implicit-def: $vgpr36
; GCN-NEXT: ; implicit-def: $vgpr51
; GCN-NEXT: ; implicit-def: $vgpr34
; GCN-NEXT: ; implicit-def: $vgpr49
; GCN-NEXT: ; implicit-def: $vgpr33
; GCN-NEXT: ; implicit-def: $vgpr39
; GCN-NEXT: ; implicit-def: $vgpr32
; GCN-NEXT: ; implicit-def: $vgpr37
; GCN-NEXT: ; implicit-def: $vgpr31
; GCN-NEXT: ; implicit-def: $vgpr35
; GCN-NEXT: ; implicit-def: $vgpr21
; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: ; implicit-def: $vgpr19
; GCN-NEXT: ; implicit-def: $vgpr23
; GCN-NEXT: ; implicit-def: $vgpr18
; GCN-NEXT: ; implicit-def: $vgpr22
; GCN-NEXT: ; implicit-def: $vgpr17
; GCN-NEXT: ; implicit-def: $vgpr20
; GCN-NEXT: ; implicit-def: $vgpr16
; GCN-NEXT: .LBB132_2: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB132_4
; GCN-NEXT: ; %bb.3: ; %cmp.true
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45
; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44
; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42
; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_or_b32_e32 v1, v3, v2
; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52
; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40
; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50
; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55
; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48
; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54
; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38
; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53
; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36
; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51
; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34
; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49
; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33
; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39
; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32
; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37
; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31
; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35
; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28
; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29
; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19
; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22
; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_or_b32_e32 v2, v3, v2
; GCN-NEXT: v_or_b32_e32 v3, v5, v4
; GCN-NEXT: v_or_b32_e32 v4, v7, v6
; GCN-NEXT: v_or_b32_e32 v5, v9, v8
; GCN-NEXT: v_or_b32_e32 v6, v11, v10
; GCN-NEXT: v_or_b32_e32 v7, v13, v12
; GCN-NEXT: v_or_b32_e32 v8, v15, v14
; GCN-NEXT: v_or_b32_e32 v9, v26, v24
; GCN-NEXT: v_or_b32_e32 v10, v28, v27
; GCN-NEXT: v_or_b32_e32 v11, v21, v29
; GCN-NEXT: v_or_b32_e32 v12, v19, v25
; GCN-NEXT: v_or_b32_e32 v13, v18, v23
; GCN-NEXT: v_or_b32_e32 v14, v17, v22
; GCN-NEXT: v_or_b32_e32 v15, v16, v20
; GCN-NEXT: .LBB132_4: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32f16_to_v8f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB132_2
; VI-NEXT: ; %bb.1: ; %cmp.true
; VI-NEXT: v_mov_b32_e32 v16, 0x200
; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v15, 0x200, v15
; VI-NEXT: v_or_b32_e32 v15, v15, v17
; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v14, 0x200, v14
; VI-NEXT: v_or_b32_e32 v14, v14, v17
; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v13, 0x200, v13
; VI-NEXT: v_or_b32_e32 v13, v13, v17
; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v12, 0x200, v12
; VI-NEXT: v_or_b32_e32 v12, v12, v17
; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v11, 0x200, v11
; VI-NEXT: v_or_b32_e32 v11, v11, v17
; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v10, 0x200, v10
; VI-NEXT: v_or_b32_e32 v10, v10, v17
; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
; VI-NEXT: v_or_b32_e32 v9, v9, v17
; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v8, 0x200, v8
; VI-NEXT: v_or_b32_e32 v8, v8, v17
; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
; VI-NEXT: v_or_b32_e32 v7, v7, v17
; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
; VI-NEXT: v_or_b32_e32 v6, v6, v17
; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v5, 0x200, v5
; VI-NEXT: v_or_b32_e32 v5, v5, v17
; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v4, 0x200, v4
; VI-NEXT: v_or_b32_e32 v4, v4, v17
; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
; VI-NEXT: v_or_b32_e32 v3, v3, v17
; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
; VI-NEXT: v_or_b32_e32 v2, v2, v17
; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, 0x200, v0
; VI-NEXT: v_or_b32_e32 v1, v1, v17
; VI-NEXT: v_or_b32_e32 v0, v0, v16
; VI-NEXT: .LBB132_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32f16_to_v8f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB132_2
; GFX9-NEXT: ; %bb.1: ; %cmp.true
; GFX9-NEXT: s_movk_i32 s6, 0x200
; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
; GFX9-NEXT: .LBB132_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32f16_to_v8f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB132_2
; GFX11-NEXT: ; %bb.1: ; %cmp.true
; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
; GFX11-NEXT: .LBB132_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; Branch to cmp.true when %b is zero, otherwise to cmp.false.
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
cmp.true:
; Add the half constant with bit pattern 0x0200 to every lane, then
; reinterpret the 512 result bits as <8 x double>.
%a1 = fadd <32 x half> %a, splat (half 0xH0200)
%a2 = bitcast <32 x half> %a1 to <8 x double>
br label %end
cmp.false:
; Reinterpret the unmodified input bits as <8 x double>.
%a3 = bitcast <32 x half> %a to <8 x double>
br label %end
end:
; Select whichever arm was executed and return it.
%phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <8 x double> %phi
}
define <8 x i64> @v_bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) {
; GCN-LABEL: v_bitcast_v32i16_to_v8i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v38, v14
; GCN-NEXT: v_mov_b32_e32 v37, v12
; GCN-NEXT: v_mov_b32_e32 v36, v10
; GCN-NEXT: v_mov_b32_e32 v35, v8
; GCN-NEXT: v_mov_b32_e32 v34, v6
; GCN-NEXT: v_mov_b32_e32 v33, v4
; GCN-NEXT: v_mov_b32_e32 v32, v2
; GCN-NEXT: v_mov_b32_e32 v31, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1
; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11
; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15
; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz .LBB133_3
; GCN-NEXT: ; %bb.1: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execnz .LBB133_4
; GCN-NEXT: .LBB133_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
; GCN-NEXT: .LBB133_3: ; %cmp.false
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32
; GCN-NEXT: v_or_b32_e32 v0, v0, v54
; GCN-NEXT: v_or_b32_e32 v1, v1, v55
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33
; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34
; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35
; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36
; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37
; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38
; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16
; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18
; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20
; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22
; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24
; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26
; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28
; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30
; GCN-NEXT: v_or_b32_e32 v2, v2, v39
; GCN-NEXT: v_or_b32_e32 v3, v3, v48
; GCN-NEXT: v_or_b32_e32 v4, v4, v49
; GCN-NEXT: v_or_b32_e32 v5, v5, v50
; GCN-NEXT: v_or_b32_e32 v6, v6, v51
; GCN-NEXT: v_or_b32_e32 v7, v7, v52
; GCN-NEXT: v_or_b32_e32 v8, v8, v17
; GCN-NEXT: v_or_b32_e32 v9, v9, v19
; GCN-NEXT: v_or_b32_e32 v10, v10, v21
; GCN-NEXT: v_or_b32_e32 v11, v11, v23
; GCN-NEXT: v_or_b32_e32 v12, v12, v25
; GCN-NEXT: v_or_b32_e32 v13, v13, v27
; GCN-NEXT: v_or_b32_e32 v14, v14, v29
; GCN-NEXT: v_or_b32_e32 v15, v15, v53
; GCN-NEXT: ; implicit-def: $vgpr31
; GCN-NEXT: ; implicit-def: $vgpr32
; GCN-NEXT: ; implicit-def: $vgpr33
; GCN-NEXT: ; implicit-def: $vgpr34
; GCN-NEXT: ; implicit-def: $vgpr35
; GCN-NEXT: ; implicit-def: $vgpr36
; GCN-NEXT: ; implicit-def: $vgpr37
; GCN-NEXT: ; implicit-def: $vgpr38
; GCN-NEXT: ; implicit-def: $vgpr16
; GCN-NEXT: ; implicit-def: $vgpr18
; GCN-NEXT: ; implicit-def: $vgpr20
; GCN-NEXT: ; implicit-def: $vgpr22
; GCN-NEXT: ; implicit-def: $vgpr24
; GCN-NEXT: ; implicit-def: $vgpr26
; GCN-NEXT: ; implicit-def: $vgpr28
; GCN-NEXT: ; implicit-def: $vgpr30
; GCN-NEXT: ; implicit-def: $vgpr54
; GCN-NEXT: ; implicit-def: $vgpr55
; GCN-NEXT: ; implicit-def: $vgpr39
; GCN-NEXT: ; implicit-def: $vgpr48
; GCN-NEXT: ; implicit-def: $vgpr49
; GCN-NEXT: ; implicit-def: $vgpr50
; GCN-NEXT: ; implicit-def: $vgpr51
; GCN-NEXT: ; implicit-def: $vgpr52
; GCN-NEXT: ; implicit-def: $vgpr17
; GCN-NEXT: ; implicit-def: $vgpr19
; GCN-NEXT: ; implicit-def: $vgpr21
; GCN-NEXT: ; implicit-def: $vgpr23
; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: ; implicit-def: $vgpr27
; GCN-NEXT: ; implicit-def: $vgpr29
; GCN-NEXT: ; implicit-def: $vgpr53
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB133_2
; GCN-NEXT: .LBB133_4: ; %cmp.true
; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31
; GCN-NEXT: s_mov_b32 s6, 0x30000
; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32
; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33
; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34
; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35
; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36
; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37
; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38
; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16
; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18
; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20
; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22
; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24
; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26
; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28
; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GCN-NEXT: v_or_b32_e32 v0, v54, v0
; GCN-NEXT: v_or_b32_e32 v1, v55, v1
; GCN-NEXT: v_or_b32_e32 v2, v39, v2
; GCN-NEXT: v_or_b32_e32 v3, v48, v3
; GCN-NEXT: v_or_b32_e32 v4, v49, v4
; GCN-NEXT: v_or_b32_e32 v5, v50, v5
; GCN-NEXT: v_or_b32_e32 v6, v51, v6
; GCN-NEXT: v_or_b32_e32 v7, v52, v7
; GCN-NEXT: v_or_b32_e32 v8, v17, v8
; GCN-NEXT: v_or_b32_e32 v9, v19, v9
; GCN-NEXT: v_or_b32_e32 v10, v21, v10
; GCN-NEXT: v_or_b32_e32 v11, v23, v11
; GCN-NEXT: v_or_b32_e32 v12, v25, v12
; GCN-NEXT: v_or_b32_e32 v13, v27, v13
; GCN-NEXT: v_or_b32_e32 v14, v29, v14
; GCN-NEXT: v_or_b32_e32 v15, v53, v15
; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1
; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3
; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5
; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6
; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7
; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8
; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9
; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10
; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11
; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12
; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13
; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14
; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32i16_to_v8i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB133_2
; VI-NEXT: ; %bb.1: ; %cmp.true
; VI-NEXT: v_mov_b32_e32 v17, 3
; VI-NEXT: v_add_u16_e32 v16, 3, v15
; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v15, v16, v15
; VI-NEXT: v_add_u16_e32 v16, 3, v14
; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v14, v16, v14
; VI-NEXT: v_add_u16_e32 v16, 3, v13
; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v13, v16, v13
; VI-NEXT: v_add_u16_e32 v16, 3, v12
; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v12, v16, v12
; VI-NEXT: v_add_u16_e32 v16, 3, v11
; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v11, v16, v11
; VI-NEXT: v_add_u16_e32 v16, 3, v10
; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v10, v16, v10
; VI-NEXT: v_add_u16_e32 v16, 3, v9
; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v9, v16, v9
; VI-NEXT: v_add_u16_e32 v16, 3, v8
; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v8, v16, v8
; VI-NEXT: v_add_u16_e32 v16, 3, v7
; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v7, v16, v7
; VI-NEXT: v_add_u16_e32 v16, 3, v6
; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v6, v16, v6
; VI-NEXT: v_add_u16_e32 v16, 3, v5
; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v5, v16, v5
; VI-NEXT: v_add_u16_e32 v16, 3, v4
; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v4, v16, v4
; VI-NEXT: v_add_u16_e32 v16, 3, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, v16, v3
; VI-NEXT: v_add_u16_e32 v16, 3, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v16, v2
; VI-NEXT: v_add_u16_e32 v16, 3, v1
; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v16, v1
; VI-NEXT: v_add_u16_e32 v16, 3, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v16, v0
; VI-NEXT: .LBB133_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32i16_to_v8i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB133_2
; GFX9-NEXT: ; %bb.1: ; %cmp.true
; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
; GFX9-NEXT: .LBB133_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32i16_to_v8i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB133_2
; GFX11-NEXT: ; %bb.1: ; %cmp.true
; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
; GFX11-NEXT: .LBB133_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
cmp.true:
%a1 = add <32 x i16> %a, splat (i16 3)
%a2 = bitcast <32 x i16> %a1 to <8 x i64>
br label %end
cmp.false:
%a3 = bitcast <32 x i16> %a to <8 x i64>
br label %end
end:
%phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <8 x i64> %phi
}
; Bitcast of a <32 x i16> argument to <8 x double>, with the result merged
; through a phi. When %b is zero, 3 is added to every i16 element before the
; bitcast (cmp.true); otherwise the argument is bitcast unchanged (cmp.false).
; The assertions below are autogenerated, one group per check prefix of the
; RUN lines at the top of the file; regenerate them rather than editing by hand.
define <8 x double> @v_bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) {
; GCN-LABEL: v_bitcast_v32i16_to_v8f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v38, v14
; GCN-NEXT: v_mov_b32_e32 v37, v12
; GCN-NEXT: v_mov_b32_e32 v36, v10
; GCN-NEXT: v_mov_b32_e32 v35, v8
; GCN-NEXT: v_mov_b32_e32 v34, v6
; GCN-NEXT: v_mov_b32_e32 v33, v4
; GCN-NEXT: v_mov_b32_e32 v32, v2
; GCN-NEXT: v_mov_b32_e32 v31, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1
; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11
; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15
; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz .LBB134_3
; GCN-NEXT: ; %bb.1: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execnz .LBB134_4
; GCN-NEXT: .LBB134_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
; GCN-NEXT: .LBB134_3: ; %cmp.false
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32
; GCN-NEXT: v_or_b32_e32 v0, v0, v54
; GCN-NEXT: v_or_b32_e32 v1, v1, v55
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33
; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34
; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35
; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36
; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37
; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38
; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16
; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18
; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20
; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22
; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24
; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26
; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28
; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30
; GCN-NEXT: v_or_b32_e32 v2, v2, v39
; GCN-NEXT: v_or_b32_e32 v3, v3, v48
; GCN-NEXT: v_or_b32_e32 v4, v4, v49
; GCN-NEXT: v_or_b32_e32 v5, v5, v50
; GCN-NEXT: v_or_b32_e32 v6, v6, v51
; GCN-NEXT: v_or_b32_e32 v7, v7, v52
; GCN-NEXT: v_or_b32_e32 v8, v8, v17
; GCN-NEXT: v_or_b32_e32 v9, v9, v19
; GCN-NEXT: v_or_b32_e32 v10, v10, v21
; GCN-NEXT: v_or_b32_e32 v11, v11, v23
; GCN-NEXT: v_or_b32_e32 v12, v12, v25
; GCN-NEXT: v_or_b32_e32 v13, v13, v27
; GCN-NEXT: v_or_b32_e32 v14, v14, v29
; GCN-NEXT: v_or_b32_e32 v15, v15, v53
; GCN-NEXT: ; implicit-def: $vgpr31
; GCN-NEXT: ; implicit-def: $vgpr32
; GCN-NEXT: ; implicit-def: $vgpr33
; GCN-NEXT: ; implicit-def: $vgpr34
; GCN-NEXT: ; implicit-def: $vgpr35
; GCN-NEXT: ; implicit-def: $vgpr36
; GCN-NEXT: ; implicit-def: $vgpr37
; GCN-NEXT: ; implicit-def: $vgpr38
; GCN-NEXT: ; implicit-def: $vgpr16
; GCN-NEXT: ; implicit-def: $vgpr18
; GCN-NEXT: ; implicit-def: $vgpr20
; GCN-NEXT: ; implicit-def: $vgpr22
; GCN-NEXT: ; implicit-def: $vgpr24
; GCN-NEXT: ; implicit-def: $vgpr26
; GCN-NEXT: ; implicit-def: $vgpr28
; GCN-NEXT: ; implicit-def: $vgpr30
; GCN-NEXT: ; implicit-def: $vgpr54
; GCN-NEXT: ; implicit-def: $vgpr55
; GCN-NEXT: ; implicit-def: $vgpr39
; GCN-NEXT: ; implicit-def: $vgpr48
; GCN-NEXT: ; implicit-def: $vgpr49
; GCN-NEXT: ; implicit-def: $vgpr50
; GCN-NEXT: ; implicit-def: $vgpr51
; GCN-NEXT: ; implicit-def: $vgpr52
; GCN-NEXT: ; implicit-def: $vgpr17
; GCN-NEXT: ; implicit-def: $vgpr19
; GCN-NEXT: ; implicit-def: $vgpr21
; GCN-NEXT: ; implicit-def: $vgpr23
; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: ; implicit-def: $vgpr27
; GCN-NEXT: ; implicit-def: $vgpr29
; GCN-NEXT: ; implicit-def: $vgpr53
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB134_2
; GCN-NEXT: .LBB134_4: ; %cmp.true
; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31
; GCN-NEXT: s_mov_b32 s6, 0x30000
; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32
; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33
; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34
; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35
; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36
; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37
; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38
; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16
; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18
; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20
; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22
; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24
; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26
; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28
; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GCN-NEXT: v_or_b32_e32 v0, v54, v0
; GCN-NEXT: v_or_b32_e32 v1, v55, v1
; GCN-NEXT: v_or_b32_e32 v2, v39, v2
; GCN-NEXT: v_or_b32_e32 v3, v48, v3
; GCN-NEXT: v_or_b32_e32 v4, v49, v4
; GCN-NEXT: v_or_b32_e32 v5, v50, v5
; GCN-NEXT: v_or_b32_e32 v6, v51, v6
; GCN-NEXT: v_or_b32_e32 v7, v52, v7
; GCN-NEXT: v_or_b32_e32 v8, v17, v8
; GCN-NEXT: v_or_b32_e32 v9, v19, v9
; GCN-NEXT: v_or_b32_e32 v10, v21, v10
; GCN-NEXT: v_or_b32_e32 v11, v23, v11
; GCN-NEXT: v_or_b32_e32 v12, v25, v12
; GCN-NEXT: v_or_b32_e32 v13, v27, v13
; GCN-NEXT: v_or_b32_e32 v14, v29, v14
; GCN-NEXT: v_or_b32_e32 v15, v53, v15
; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1
; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3
; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4
; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5
; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6
; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7
; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8
; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9
; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10
; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11
; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12
; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13
; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14
; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32i16_to_v8f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB134_2
; VI-NEXT: ; %bb.1: ; %cmp.true
; VI-NEXT: v_mov_b32_e32 v17, 3
; VI-NEXT: v_add_u16_e32 v16, 3, v15
; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v15, v16, v15
; VI-NEXT: v_add_u16_e32 v16, 3, v14
; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v14, v16, v14
; VI-NEXT: v_add_u16_e32 v16, 3, v13
; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v13, v16, v13
; VI-NEXT: v_add_u16_e32 v16, 3, v12
; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v12, v16, v12
; VI-NEXT: v_add_u16_e32 v16, 3, v11
; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v11, v16, v11
; VI-NEXT: v_add_u16_e32 v16, 3, v10
; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v10, v16, v10
; VI-NEXT: v_add_u16_e32 v16, 3, v9
; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v9, v16, v9
; VI-NEXT: v_add_u16_e32 v16, 3, v8
; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v8, v16, v8
; VI-NEXT: v_add_u16_e32 v16, 3, v7
; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v7, v16, v7
; VI-NEXT: v_add_u16_e32 v16, 3, v6
; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v6, v16, v6
; VI-NEXT: v_add_u16_e32 v16, 3, v5
; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v5, v16, v5
; VI-NEXT: v_add_u16_e32 v16, 3, v4
; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v4, v16, v4
; VI-NEXT: v_add_u16_e32 v16, 3, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, v16, v3
; VI-NEXT: v_add_u16_e32 v16, 3, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v16, v2
; VI-NEXT: v_add_u16_e32 v16, 3, v1
; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v16, v1
; VI-NEXT: v_add_u16_e32 v16, 3, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v16, v0
; VI-NEXT: .LBB134_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32i16_to_v8f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB134_2
; GFX9-NEXT: ; %bb.1: ; %cmp.true
; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
; GFX9-NEXT: .LBB134_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32i16_to_v8f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB134_2
; GFX11-NEXT: ; %bb.1: ; %cmp.true
; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
; GFX11-NEXT: .LBB134_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
; Branch on %b: zero takes the path that modifies the vector first.
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
cmp.true:
; Add 3 to each of the 32 i16 elements, then reinterpret the bits as 8 doubles.
%a1 = add <32 x i16> %a, splat (i16 3)
%a2 = bitcast <32 x i16> %a1 to <8 x double>
br label %end
cmp.false:
; Reinterpret the unmodified argument as 8 doubles.
%a3 = bitcast <32 x i16> %a to <8 x double>
br label %end
end:
; Merge the two bitcast results; the phi is on the destination vector type.
%phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
ret <8 x double> %phi
}
define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x float> %value) {
; GCN-LABEL: v_bitcast_v32f32_to_v64bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v58, 0
; GCN-NEXT: v_mov_b32_e32 v59, 0
; GCN-NEXT: v_mov_b32_e32 v56, 0
; GCN-NEXT: v_mov_b32_e32 v57, 0
; GCN-NEXT: v_mov_b32_e32 v46, 0
; GCN-NEXT: v_mov_b32_e32 v47, 0
; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: v_mov_b32_e32 v45, 0
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB135_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v62
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v62
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v61
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v61
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v60
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v60
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v30
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v30
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v29
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v29
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v28
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v28
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v27
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v27
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v25
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v25
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v24
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v24
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v23
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v22
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v21
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v21
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v20
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v20
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v19
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v18
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v17
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v17
; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v16
; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v16
; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v15
; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v15
; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v14
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v14
; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v13
; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v12
; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11
; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v10
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v10
; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v9
; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v8
; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v7
; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v6
; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v5
; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v4
; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3
; GCN-NEXT: .LBB135_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v59
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v58
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v57
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v56
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v46
; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v45
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v44
; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v43
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v42
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v41
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v53
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v52
; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v51
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v49
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v48
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v39
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v38
; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v37
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v36
; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v35
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v34
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v33
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v32
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v31
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_alignbit_b32 v5, v5, v0, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32f32_to_v64bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_mov_b32 s19, s4
; VI-NEXT: s_mov_b32 s5, s4
; VI-NEXT: s_mov_b32 s6, s4
; VI-NEXT: s_mov_b32 s7, s4
; VI-NEXT: s_mov_b32 s8, s4
; VI-NEXT: s_mov_b32 s9, s4
; VI-NEXT: s_mov_b32 s10, s4
; VI-NEXT: s_mov_b32 s11, s4
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s4
; VI-NEXT: s_mov_b32 s14, s4
; VI-NEXT: s_mov_b32 s15, s4
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s4
; VI-NEXT: s_mov_b32 s18, s4
; VI-NEXT: v_mov_b32_e32 v50, s19
; VI-NEXT: v_mov_b32_e32 v49, s18
; VI-NEXT: v_mov_b32_e32 v48, s17
; VI-NEXT: v_mov_b32_e32 v47, s16
; VI-NEXT: v_mov_b32_e32 v46, s15
; VI-NEXT: v_mov_b32_e32 v45, s14
; VI-NEXT: v_mov_b32_e32 v44, s13
; VI-NEXT: v_mov_b32_e32 v43, s12
; VI-NEXT: v_mov_b32_e32 v42, s11
; VI-NEXT: v_mov_b32_e32 v41, s10
; VI-NEXT: v_mov_b32_e32 v40, s9
; VI-NEXT: v_mov_b32_e32 v39, s8
; VI-NEXT: v_mov_b32_e32 v38, s7
; VI-NEXT: v_mov_b32_e32 v37, s6
; VI-NEXT: v_mov_b32_e32 v36, s5
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB135_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
; VI-NEXT: v_mov_b32_e32 v48, v16
; VI-NEXT: v_mov_b32_e32 v47, v15
; VI-NEXT: v_mov_b32_e32 v46, v14
; VI-NEXT: v_mov_b32_e32 v45, v13
; VI-NEXT: v_mov_b32_e32 v44, v12
; VI-NEXT: v_mov_b32_e32 v43, v11
; VI-NEXT: v_mov_b32_e32 v42, v10
; VI-NEXT: v_mov_b32_e32 v41, v9
; VI-NEXT: v_mov_b32_e32 v40, v8
; VI-NEXT: v_mov_b32_e32 v39, v7
; VI-NEXT: v_mov_b32_e32 v38, v6
; VI-NEXT: v_mov_b32_e32 v37, v5
; VI-NEXT: v_mov_b32_e32 v36, v4
; VI-NEXT: v_mov_b32_e32 v35, v3
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: .LBB135_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: s_movk_i32 s4, 0x70
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x60
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x50
; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32f32_to_v64bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s19, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s7, s4
; GFX9-NEXT: s_mov_b32 s8, s4
; GFX9-NEXT: s_mov_b32 s9, s4
; GFX9-NEXT: s_mov_b32 s10, s4
; GFX9-NEXT: s_mov_b32 s11, s4
; GFX9-NEXT: s_mov_b32 s12, s4
; GFX9-NEXT: s_mov_b32 s13, s4
; GFX9-NEXT: s_mov_b32 s14, s4
; GFX9-NEXT: s_mov_b32 s15, s4
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s4
; GFX9-NEXT: s_mov_b32 s18, s4
; GFX9-NEXT: v_mov_b32_e32 v50, s19
; GFX9-NEXT: v_mov_b32_e32 v49, s18
; GFX9-NEXT: v_mov_b32_e32 v48, s17
; GFX9-NEXT: v_mov_b32_e32 v47, s16
; GFX9-NEXT: v_mov_b32_e32 v46, s15
; GFX9-NEXT: v_mov_b32_e32 v45, s14
; GFX9-NEXT: v_mov_b32_e32 v44, s13
; GFX9-NEXT: v_mov_b32_e32 v43, s12
; GFX9-NEXT: v_mov_b32_e32 v42, s11
; GFX9-NEXT: v_mov_b32_e32 v41, s10
; GFX9-NEXT: v_mov_b32_e32 v40, s9
; GFX9-NEXT: v_mov_b32_e32 v39, s8
; GFX9-NEXT: v_mov_b32_e32 v38, s7
; GFX9-NEXT: v_mov_b32_e32 v37, s6
; GFX9-NEXT: v_mov_b32_e32 v36, s5
; GFX9-NEXT: v_mov_b32_e32 v35, s4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB135_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
; GFX9-NEXT: v_mov_b32_e32 v48, v16
; GFX9-NEXT: v_mov_b32_e32 v47, v15
; GFX9-NEXT: v_mov_b32_e32 v46, v14
; GFX9-NEXT: v_mov_b32_e32 v45, v13
; GFX9-NEXT: v_mov_b32_e32 v44, v12
; GFX9-NEXT: v_mov_b32_e32 v43, v11
; GFX9-NEXT: v_mov_b32_e32 v42, v10
; GFX9-NEXT: v_mov_b32_e32 v41, v9
; GFX9-NEXT: v_mov_b32_e32 v40, v8
; GFX9-NEXT: v_mov_b32_e32 v39, v7
; GFX9-NEXT: v_mov_b32_e32 v38, v6
; GFX9-NEXT: v_mov_b32_e32 v37, v5
; GFX9-NEXT: v_mov_b32_e32 v36, v4
; GFX9-NEXT: v_mov_b32_e32 v35, v3
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: .LBB135_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32f32_to_v64bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s15, s0
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s0
; GFX11-NEXT: s_mov_b32 s6, s0
; GFX11-NEXT: s_mov_b32 s7, s0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s0
; GFX11-NEXT: s_mov_b32 s10, s0
; GFX11-NEXT: s_mov_b32 s11, s0
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: s_mov_b32 s13, s0
; GFX11-NEXT: s_mov_b32 s14, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB135_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
; GFX11-NEXT: .LBB135_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <32 x float> %value to <64 x bfloat>
br label %end
end:
%phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <64 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x i32> %value) {
; GCN-LABEL: v_bitcast_v32i32_to_v64bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v58, 0
; GCN-NEXT: v_mov_b32_e32 v59, 0
; GCN-NEXT: v_mov_b32_e32 v56, 0
; GCN-NEXT: v_mov_b32_e32 v57, 0
; GCN-NEXT: v_mov_b32_e32 v46, 0
; GCN-NEXT: v_mov_b32_e32 v47, 0
; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: v_mov_b32_e32 v45, 0
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: v_mov_b32_e32 v35, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB136_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v62
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v62
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v61
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v61
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v60
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v60
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v30
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v30
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v29
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v29
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v28
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v28
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v27
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v27
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v25
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v25
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v24
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v24
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v23
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v22
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v21
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v21
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v20
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v20
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v19
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v18
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v17
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v17
; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v16
; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v16
; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v15
; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v15
; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v14
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v14
; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v13
; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v13
; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v12
; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11
; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v10
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v10
; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v9
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v9
; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v8
; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v8
; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v7
; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v7
; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v6
; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v6
; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v5
; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v5
; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v4
; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v4
; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v3
; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3
; GCN-NEXT: .LBB136_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v59
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v58
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v57
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v56
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v46
; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v45
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v44
; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v43
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v42
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v41
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v53
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v52
; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v51
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v49
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v48
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v39
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v38
; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v37
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v36
; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v35
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v34
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v33
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v32
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v31
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_alignbit_b32 v5, v5, v0, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v32i32_to_v64bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_mov_b32 s19, s4
; VI-NEXT: s_mov_b32 s5, s4
; VI-NEXT: s_mov_b32 s6, s4
; VI-NEXT: s_mov_b32 s7, s4
; VI-NEXT: s_mov_b32 s8, s4
; VI-NEXT: s_mov_b32 s9, s4
; VI-NEXT: s_mov_b32 s10, s4
; VI-NEXT: s_mov_b32 s11, s4
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s4
; VI-NEXT: s_mov_b32 s14, s4
; VI-NEXT: s_mov_b32 s15, s4
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s4
; VI-NEXT: s_mov_b32 s18, s4
; VI-NEXT: v_mov_b32_e32 v50, s19
; VI-NEXT: v_mov_b32_e32 v49, s18
; VI-NEXT: v_mov_b32_e32 v48, s17
; VI-NEXT: v_mov_b32_e32 v47, s16
; VI-NEXT: v_mov_b32_e32 v46, s15
; VI-NEXT: v_mov_b32_e32 v45, s14
; VI-NEXT: v_mov_b32_e32 v44, s13
; VI-NEXT: v_mov_b32_e32 v43, s12
; VI-NEXT: v_mov_b32_e32 v42, s11
; VI-NEXT: v_mov_b32_e32 v41, s10
; VI-NEXT: v_mov_b32_e32 v40, s9
; VI-NEXT: v_mov_b32_e32 v39, s8
; VI-NEXT: v_mov_b32_e32 v38, s7
; VI-NEXT: v_mov_b32_e32 v37, s6
; VI-NEXT: v_mov_b32_e32 v36, s5
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB136_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
; VI-NEXT: v_mov_b32_e32 v48, v16
; VI-NEXT: v_mov_b32_e32 v47, v15
; VI-NEXT: v_mov_b32_e32 v46, v14
; VI-NEXT: v_mov_b32_e32 v45, v13
; VI-NEXT: v_mov_b32_e32 v44, v12
; VI-NEXT: v_mov_b32_e32 v43, v11
; VI-NEXT: v_mov_b32_e32 v42, v10
; VI-NEXT: v_mov_b32_e32 v41, v9
; VI-NEXT: v_mov_b32_e32 v40, v8
; VI-NEXT: v_mov_b32_e32 v39, v7
; VI-NEXT: v_mov_b32_e32 v38, v6
; VI-NEXT: v_mov_b32_e32 v37, v5
; VI-NEXT: v_mov_b32_e32 v36, v4
; VI-NEXT: v_mov_b32_e32 v35, v3
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: .LBB136_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: s_movk_i32 s4, 0x70
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x60
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x50
; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v32i32_to_v64bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s19, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s7, s4
; GFX9-NEXT: s_mov_b32 s8, s4
; GFX9-NEXT: s_mov_b32 s9, s4
; GFX9-NEXT: s_mov_b32 s10, s4
; GFX9-NEXT: s_mov_b32 s11, s4
; GFX9-NEXT: s_mov_b32 s12, s4
; GFX9-NEXT: s_mov_b32 s13, s4
; GFX9-NEXT: s_mov_b32 s14, s4
; GFX9-NEXT: s_mov_b32 s15, s4
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s4
; GFX9-NEXT: s_mov_b32 s18, s4
; GFX9-NEXT: v_mov_b32_e32 v50, s19
; GFX9-NEXT: v_mov_b32_e32 v49, s18
; GFX9-NEXT: v_mov_b32_e32 v48, s17
; GFX9-NEXT: v_mov_b32_e32 v47, s16
; GFX9-NEXT: v_mov_b32_e32 v46, s15
; GFX9-NEXT: v_mov_b32_e32 v45, s14
; GFX9-NEXT: v_mov_b32_e32 v44, s13
; GFX9-NEXT: v_mov_b32_e32 v43, s12
; GFX9-NEXT: v_mov_b32_e32 v42, s11
; GFX9-NEXT: v_mov_b32_e32 v41, s10
; GFX9-NEXT: v_mov_b32_e32 v40, s9
; GFX9-NEXT: v_mov_b32_e32 v39, s8
; GFX9-NEXT: v_mov_b32_e32 v38, s7
; GFX9-NEXT: v_mov_b32_e32 v37, s6
; GFX9-NEXT: v_mov_b32_e32 v36, s5
; GFX9-NEXT: v_mov_b32_e32 v35, s4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB136_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
; GFX9-NEXT: v_mov_b32_e32 v48, v16
; GFX9-NEXT: v_mov_b32_e32 v47, v15
; GFX9-NEXT: v_mov_b32_e32 v46, v14
; GFX9-NEXT: v_mov_b32_e32 v45, v13
; GFX9-NEXT: v_mov_b32_e32 v44, v12
; GFX9-NEXT: v_mov_b32_e32 v43, v11
; GFX9-NEXT: v_mov_b32_e32 v42, v10
; GFX9-NEXT: v_mov_b32_e32 v41, v9
; GFX9-NEXT: v_mov_b32_e32 v40, v8
; GFX9-NEXT: v_mov_b32_e32 v39, v7
; GFX9-NEXT: v_mov_b32_e32 v38, v6
; GFX9-NEXT: v_mov_b32_e32 v37, v5
; GFX9-NEXT: v_mov_b32_e32 v36, v4
; GFX9-NEXT: v_mov_b32_e32 v35, v3
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: .LBB136_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v32i32_to_v64bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s15, s0
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s0
; GFX11-NEXT: s_mov_b32 s6, s0
; GFX11-NEXT: s_mov_b32 s7, s0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s0
; GFX11-NEXT: s_mov_b32 s10, s0
; GFX11-NEXT: s_mov_b32 s11, s0
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: s_mov_b32 s13, s0
; GFX11-NEXT: s_mov_b32 s14, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB136_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
; GFX11-NEXT: .LBB136_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <32 x i32> %value to <64 x bfloat>
br label %end
end:
%phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <64 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x i16> %value) {
; GCN-LABEL: v_bitcast_v64i16_to_v64bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108
; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104
; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100
; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96
; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84
; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80
; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76
; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68
; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:64
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44
; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40
; GCN-NEXT: s_waitcnt expcnt(6)
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36
; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32
; GCN-NEXT: s_waitcnt expcnt(5)
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
; GCN-NEXT: s_waitcnt expcnt(4)
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24
; GCN-NEXT: s_waitcnt expcnt(3)
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16
; GCN-NEXT: s_waitcnt expcnt(2)
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8
; GCN-NEXT: s_waitcnt expcnt(1)
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v57, 0
; GCN-NEXT: v_mov_b32_e32 v59, 0
; GCN-NEXT: v_mov_b32_e32 v56, 0
; GCN-NEXT: v_mov_b32_e32 v58, 0
; GCN-NEXT: v_mov_b32_e32 v45, 0
; GCN-NEXT: v_mov_b32_e32 v47, 0
; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: v_mov_b32_e32 v46, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB137_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3
; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4
; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v5
; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6
; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v12
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v13
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v16
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v17
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v18
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v19
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v20
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v41
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v22
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v23
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v24
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v55
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v25
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v54
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v26
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v27
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v28
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v29
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v52
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v51
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v50
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v48
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v39
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v38
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v36
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v34
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v32
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v31
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v63
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v62
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v61
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v60
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: .LBB137_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v59
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v57
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v56
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v47
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v45
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v46
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v44
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v43
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v42
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v64i16_to_v64bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_mov_b32 s19, s4
; VI-NEXT: s_mov_b32 s5, s4
; VI-NEXT: s_mov_b32 s6, s4
; VI-NEXT: s_mov_b32 s7, s4
; VI-NEXT: s_mov_b32 s8, s4
; VI-NEXT: s_mov_b32 s9, s4
; VI-NEXT: s_mov_b32 s10, s4
; VI-NEXT: s_mov_b32 s11, s4
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s4
; VI-NEXT: s_mov_b32 s14, s4
; VI-NEXT: s_mov_b32 s15, s4
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s4
; VI-NEXT: s_mov_b32 s18, s4
; VI-NEXT: v_mov_b32_e32 v50, s19
; VI-NEXT: v_mov_b32_e32 v49, s18
; VI-NEXT: v_mov_b32_e32 v48, s17
; VI-NEXT: v_mov_b32_e32 v47, s16
; VI-NEXT: v_mov_b32_e32 v46, s15
; VI-NEXT: v_mov_b32_e32 v45, s14
; VI-NEXT: v_mov_b32_e32 v44, s13
; VI-NEXT: v_mov_b32_e32 v43, s12
; VI-NEXT: v_mov_b32_e32 v42, s11
; VI-NEXT: v_mov_b32_e32 v41, s10
; VI-NEXT: v_mov_b32_e32 v40, s9
; VI-NEXT: v_mov_b32_e32 v39, s8
; VI-NEXT: v_mov_b32_e32 v38, s7
; VI-NEXT: v_mov_b32_e32 v37, s6
; VI-NEXT: v_mov_b32_e32 v36, s5
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB137_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
; VI-NEXT: v_mov_b32_e32 v48, v16
; VI-NEXT: v_mov_b32_e32 v47, v15
; VI-NEXT: v_mov_b32_e32 v46, v14
; VI-NEXT: v_mov_b32_e32 v45, v13
; VI-NEXT: v_mov_b32_e32 v44, v12
; VI-NEXT: v_mov_b32_e32 v43, v11
; VI-NEXT: v_mov_b32_e32 v42, v10
; VI-NEXT: v_mov_b32_e32 v41, v9
; VI-NEXT: v_mov_b32_e32 v40, v8
; VI-NEXT: v_mov_b32_e32 v39, v7
; VI-NEXT: v_mov_b32_e32 v38, v6
; VI-NEXT: v_mov_b32_e32 v37, v5
; VI-NEXT: v_mov_b32_e32 v36, v4
; VI-NEXT: v_mov_b32_e32 v35, v3
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: .LBB137_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: s_movk_i32 s4, 0x70
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x60
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x50
; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v64i16_to_v64bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s19, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s7, s4
; GFX9-NEXT: s_mov_b32 s8, s4
; GFX9-NEXT: s_mov_b32 s9, s4
; GFX9-NEXT: s_mov_b32 s10, s4
; GFX9-NEXT: s_mov_b32 s11, s4
; GFX9-NEXT: s_mov_b32 s12, s4
; GFX9-NEXT: s_mov_b32 s13, s4
; GFX9-NEXT: s_mov_b32 s14, s4
; GFX9-NEXT: s_mov_b32 s15, s4
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s4
; GFX9-NEXT: s_mov_b32 s18, s4
; GFX9-NEXT: v_mov_b32_e32 v50, s19
; GFX9-NEXT: v_mov_b32_e32 v49, s18
; GFX9-NEXT: v_mov_b32_e32 v48, s17
; GFX9-NEXT: v_mov_b32_e32 v47, s16
; GFX9-NEXT: v_mov_b32_e32 v46, s15
; GFX9-NEXT: v_mov_b32_e32 v45, s14
; GFX9-NEXT: v_mov_b32_e32 v44, s13
; GFX9-NEXT: v_mov_b32_e32 v43, s12
; GFX9-NEXT: v_mov_b32_e32 v42, s11
; GFX9-NEXT: v_mov_b32_e32 v41, s10
; GFX9-NEXT: v_mov_b32_e32 v40, s9
; GFX9-NEXT: v_mov_b32_e32 v39, s8
; GFX9-NEXT: v_mov_b32_e32 v38, s7
; GFX9-NEXT: v_mov_b32_e32 v37, s6
; GFX9-NEXT: v_mov_b32_e32 v36, s5
; GFX9-NEXT: v_mov_b32_e32 v35, s4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB137_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
; GFX9-NEXT: v_mov_b32_e32 v48, v16
; GFX9-NEXT: v_mov_b32_e32 v47, v15
; GFX9-NEXT: v_mov_b32_e32 v46, v14
; GFX9-NEXT: v_mov_b32_e32 v45, v13
; GFX9-NEXT: v_mov_b32_e32 v44, v12
; GFX9-NEXT: v_mov_b32_e32 v43, v11
; GFX9-NEXT: v_mov_b32_e32 v42, v10
; GFX9-NEXT: v_mov_b32_e32 v41, v9
; GFX9-NEXT: v_mov_b32_e32 v40, v8
; GFX9-NEXT: v_mov_b32_e32 v39, v7
; GFX9-NEXT: v_mov_b32_e32 v38, v6
; GFX9-NEXT: v_mov_b32_e32 v37, v5
; GFX9-NEXT: v_mov_b32_e32 v36, v4
; GFX9-NEXT: v_mov_b32_e32 v35, v3
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: .LBB137_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v64i16_to_v64bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s15, s0
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s0
; GFX11-NEXT: s_mov_b32 s6, s0
; GFX11-NEXT: s_mov_b32 s7, s0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s0
; GFX11-NEXT: s_mov_b32 s10, s0
; GFX11-NEXT: s_mov_b32 s11, s0
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: s_mov_b32 s13, s0
; GFX11-NEXT: s_mov_b32 s14, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB137_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
; GFX11-NEXT: .LBB137_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <64 x i16> %value to <64 x bfloat>
br label %end
end:
%phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <64 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x half> %value) {
; GCN-LABEL: v_bitcast_v64f16_to_v64bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v29, v16
; GCN-NEXT: v_mov_b32_e32 v16, v15
; GCN-NEXT: v_mov_b32_e32 v15, v14
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88
; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68
; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:44
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32
; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16
; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GCN-NEXT: v_mov_b32_e32 v14, 0
; GCN-NEXT: v_mov_b32_e32 v57, 0
; GCN-NEXT: v_mov_b32_e32 v59, 0
; GCN-NEXT: v_mov_b32_e32 v56, 0
; GCN-NEXT: v_mov_b32_e32 v58, 0
; GCN-NEXT: v_mov_b32_e32 v45, 0
; GCN-NEXT: v_mov_b32_e32 v47, 0
; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: v_mov_b32_e32 v46, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v34, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB138_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v59, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v56, v5
; GCN-NEXT: v_cvt_f16_f32_e32 v58, v6
; GCN-NEXT: v_cvt_f16_f32_e32 v45, v7
; GCN-NEXT: v_cvt_f16_f32_e32 v47, v8
; GCN-NEXT: v_cvt_f16_f32_e32 v44, v9
; GCN-NEXT: v_cvt_f16_f32_e32 v46, v10
; GCN-NEXT: v_cvt_f16_f32_e32 v41, v11
; GCN-NEXT: v_cvt_f16_f32_e32 v43, v12
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v13
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v15
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: v_cvt_f16_f32_e32 v13, v16
; GCN-NEXT: v_cvt_f16_f32_e32 v12, v29
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v51, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v52, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v29, v3
; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50
; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49
; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61
; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32
; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48
; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39
; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60
; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38
; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37
; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36
; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v63, v63
; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35
; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v53, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v8, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v9, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v55, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v40, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v15, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57
; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59
; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56
; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58
; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45
; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47
; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44
; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46
; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v17
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v18
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v19
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v20
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v51
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v33
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v29
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v22
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v50
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v49
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v23
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v48
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v24
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v60
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v25
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v38
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v37
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v26
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v31
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v27
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v63
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v28
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v53
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55
; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v40
; GCN-NEXT: v_mov_b32_e32 v40, v3
; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v42
; GCN-NEXT: v_mov_b32_e32 v42, v4
; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v16
; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15
; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v14
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v0
; GCN-NEXT: .LBB138_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v59
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v57
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v56
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v47
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v45
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v46
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v44
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v43
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v41
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v42
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v40
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v53
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v54
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v51
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v52
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v34
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v33
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v64f16_to_v64bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_mov_b32 s19, s4
; VI-NEXT: s_mov_b32 s5, s4
; VI-NEXT: s_mov_b32 s6, s4
; VI-NEXT: s_mov_b32 s7, s4
; VI-NEXT: s_mov_b32 s8, s4
; VI-NEXT: s_mov_b32 s9, s4
; VI-NEXT: s_mov_b32 s10, s4
; VI-NEXT: s_mov_b32 s11, s4
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s4
; VI-NEXT: s_mov_b32 s14, s4
; VI-NEXT: s_mov_b32 s15, s4
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s4
; VI-NEXT: s_mov_b32 s18, s4
; VI-NEXT: v_mov_b32_e32 v50, s19
; VI-NEXT: v_mov_b32_e32 v49, s18
; VI-NEXT: v_mov_b32_e32 v48, s17
; VI-NEXT: v_mov_b32_e32 v47, s16
; VI-NEXT: v_mov_b32_e32 v46, s15
; VI-NEXT: v_mov_b32_e32 v45, s14
; VI-NEXT: v_mov_b32_e32 v44, s13
; VI-NEXT: v_mov_b32_e32 v43, s12
; VI-NEXT: v_mov_b32_e32 v42, s11
; VI-NEXT: v_mov_b32_e32 v41, s10
; VI-NEXT: v_mov_b32_e32 v40, s9
; VI-NEXT: v_mov_b32_e32 v39, s8
; VI-NEXT: v_mov_b32_e32 v38, s7
; VI-NEXT: v_mov_b32_e32 v37, s6
; VI-NEXT: v_mov_b32_e32 v36, s5
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB138_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
; VI-NEXT: v_mov_b32_e32 v48, v16
; VI-NEXT: v_mov_b32_e32 v47, v15
; VI-NEXT: v_mov_b32_e32 v46, v14
; VI-NEXT: v_mov_b32_e32 v45, v13
; VI-NEXT: v_mov_b32_e32 v44, v12
; VI-NEXT: v_mov_b32_e32 v43, v11
; VI-NEXT: v_mov_b32_e32 v42, v10
; VI-NEXT: v_mov_b32_e32 v41, v9
; VI-NEXT: v_mov_b32_e32 v40, v8
; VI-NEXT: v_mov_b32_e32 v39, v7
; VI-NEXT: v_mov_b32_e32 v38, v6
; VI-NEXT: v_mov_b32_e32 v37, v5
; VI-NEXT: v_mov_b32_e32 v36, v4
; VI-NEXT: v_mov_b32_e32 v35, v3
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: .LBB138_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: s_movk_i32 s4, 0x70
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x60
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x50
; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v64f16_to_v64bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s19, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s7, s4
; GFX9-NEXT: s_mov_b32 s8, s4
; GFX9-NEXT: s_mov_b32 s9, s4
; GFX9-NEXT: s_mov_b32 s10, s4
; GFX9-NEXT: s_mov_b32 s11, s4
; GFX9-NEXT: s_mov_b32 s12, s4
; GFX9-NEXT: s_mov_b32 s13, s4
; GFX9-NEXT: s_mov_b32 s14, s4
; GFX9-NEXT: s_mov_b32 s15, s4
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s4
; GFX9-NEXT: s_mov_b32 s18, s4
; GFX9-NEXT: v_mov_b32_e32 v50, s19
; GFX9-NEXT: v_mov_b32_e32 v49, s18
; GFX9-NEXT: v_mov_b32_e32 v48, s17
; GFX9-NEXT: v_mov_b32_e32 v47, s16
; GFX9-NEXT: v_mov_b32_e32 v46, s15
; GFX9-NEXT: v_mov_b32_e32 v45, s14
; GFX9-NEXT: v_mov_b32_e32 v44, s13
; GFX9-NEXT: v_mov_b32_e32 v43, s12
; GFX9-NEXT: v_mov_b32_e32 v42, s11
; GFX9-NEXT: v_mov_b32_e32 v41, s10
; GFX9-NEXT: v_mov_b32_e32 v40, s9
; GFX9-NEXT: v_mov_b32_e32 v39, s8
; GFX9-NEXT: v_mov_b32_e32 v38, s7
; GFX9-NEXT: v_mov_b32_e32 v37, s6
; GFX9-NEXT: v_mov_b32_e32 v36, s5
; GFX9-NEXT: v_mov_b32_e32 v35, s4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB138_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
; GFX9-NEXT: v_mov_b32_e32 v48, v16
; GFX9-NEXT: v_mov_b32_e32 v47, v15
; GFX9-NEXT: v_mov_b32_e32 v46, v14
; GFX9-NEXT: v_mov_b32_e32 v45, v13
; GFX9-NEXT: v_mov_b32_e32 v44, v12
; GFX9-NEXT: v_mov_b32_e32 v43, v11
; GFX9-NEXT: v_mov_b32_e32 v42, v10
; GFX9-NEXT: v_mov_b32_e32 v41, v9
; GFX9-NEXT: v_mov_b32_e32 v40, v8
; GFX9-NEXT: v_mov_b32_e32 v39, v7
; GFX9-NEXT: v_mov_b32_e32 v38, v6
; GFX9-NEXT: v_mov_b32_e32 v37, v5
; GFX9-NEXT: v_mov_b32_e32 v36, v4
; GFX9-NEXT: v_mov_b32_e32 v35, v3
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: .LBB138_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v64f16_to_v64bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s15, s0
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s0
; GFX11-NEXT: s_mov_b32 s6, s0
; GFX11-NEXT: s_mov_b32 s7, s0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s0
; GFX11-NEXT: s_mov_b32 s10, s0
; GFX11-NEXT: s_mov_b32 s11, s0
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: s_mov_b32 s13, s0
; GFX11-NEXT: s_mov_b32 s14, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB138_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
; GFX11-NEXT: .LBB138_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <64 x half> %value to <64 x bfloat>
br label %end
end:
%phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <64 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128 x i8> %value) {
; GCN-LABEL: v_bitcast_v128i8_to_v64bf16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:224
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:192
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68
; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:36
; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32
; GCN-NEXT: v_mov_b32_e32 v13, 0
; GCN-NEXT: v_mov_b32_e32 v14, 0
; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_mov_b32_e32 v56, 0
; GCN-NEXT: v_mov_b32_e32 v62, 0
; GCN-NEXT: v_mov_b32_e32 v33, 0
; GCN-NEXT: v_mov_b32_e32 v10, 0
; GCN-NEXT: v_mov_b32_e32 v27, 0
; GCN-NEXT: v_mov_b32_e32 v47, 0
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v23, 0
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: v_mov_b32_e32 v28, 0
; GCN-NEXT: v_mov_b32_e32 v24, 0
; GCN-NEXT: v_mov_b32_e32 v6, 0
; GCN-NEXT: v_mov_b32_e32 v26, 0
; GCN-NEXT: v_mov_b32_e32 v61, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v60, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v19, 0
; GCN-NEXT: v_mov_b32_e32 v25, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v20, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v17, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v30, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v46, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v21, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB139_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7
; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v8
; GCN-NEXT: v_or_b32_e32 v7, v0, v3
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GCN-NEXT: v_or_b32_e32 v8, v0, v3
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GCN-NEXT: v_or_b32_e32 v24, v0, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xff, v48
; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39
; GCN-NEXT: v_or_b32_e32 v0, v0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v38
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v29
; GCN-NEXT: v_or_b32_e32 v23, v3, v4
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v37
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v35
; GCN-NEXT: v_or_b32_e32 v17, v3, v4
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v15
; GCN-NEXT: v_or_b32_e32 v18, v3, v4
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v32
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v63
; GCN-NEXT: v_or_b32_e32 v21, v3, v4
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v59
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v58
; GCN-NEXT: v_or_b32_e32 v25, v3, v4
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v57
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v42
; GCN-NEXT: v_or_b32_e32 v30, v3, v4
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v45
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v55
; GCN-NEXT: v_or_b32_e32 v35, v3, v4
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GCN-NEXT: v_or_b32_e32 v55, v3, v4
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GCN-NEXT: v_or_b32_e32 v42, v3, v4
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GCN-NEXT: v_or_b32_e32 v45, v3, v4
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GCN-NEXT: v_or_b32_e32 v32, v3, v4
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; GCN-NEXT: v_or_b32_e32 v57, v3, v4
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v14, v4, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v12, v4, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v22, v4, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v4, v4, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v56, v5, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v33, v5, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v10, v5, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v27, v5, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v5, v5, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v52, v6, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v9, v6, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v40, v6, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v36, v6, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v49, v6, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_or_b32_e32 v3, v6, v3
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_or_b32_e32 v31, v11, v6
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_or_b32_e32 v28, v11, v6
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_or_b32_e32 v6, v11, v6
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v13
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_or_b32_e32 v26, v13, v11
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v13
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_or_b32_e32 v61, v13, v11
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v13
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_or_b32_e32 v11, v13, v11
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v41, v15, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v60, v15, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v44, v15, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v19, v15, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v54, v15, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v20, v15, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v51, v15, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v50, v15, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_or_b32_e32 v13, v15, v13
; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v8
; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v24
; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v0
; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v17
; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v18
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v21
; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v30
; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v35
; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55
; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v42
; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v32
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v57
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; GCN-NEXT: .LBB139_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14
; GCN-NEXT: v_alignbit_b32 v12, v7, v8, 16
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v22
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v16
; GCN-NEXT: v_mov_b32_e32 v16, v13
; GCN-NEXT: v_alignbit_b32 v13, v7, v8, 16
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v56
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v14, v7, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v33
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v62
; GCN-NEXT: v_alignbit_b32 v15, v4, v7, 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v27
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v10
; GCN-NEXT: v_alignbit_b32 v7, v4, v7, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47
; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52
; GCN-NEXT: v_alignbit_b32 v9, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v43
; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v49
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v36
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_alignbit_b32 v7, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v23
; GCN-NEXT: v_alignbit_b32 v8, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v28
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v31
; GCN-NEXT: v_alignbit_b32 v9, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24
; GCN-NEXT: v_alignbit_b32 v10, v3, v4, 16
; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v61
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v26
; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v53
; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v60
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v41
; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v44
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_alignbit_b32 v6, v6, v0, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v54
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v17
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v18
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v30
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v50
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v46
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v128i8_to_v64bf16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:396
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_mov_b32 s19, s4
; VI-NEXT: s_mov_b32 s5, s4
; VI-NEXT: s_mov_b32 s6, s4
; VI-NEXT: s_mov_b32 s7, s4
; VI-NEXT: s_mov_b32 s8, s4
; VI-NEXT: s_mov_b32 s9, s4
; VI-NEXT: s_mov_b32 s10, s4
; VI-NEXT: s_mov_b32 s11, s4
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s4
; VI-NEXT: s_mov_b32 s14, s4
; VI-NEXT: s_mov_b32 s15, s4
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s4
; VI-NEXT: s_mov_b32 s18, s4
; VI-NEXT: v_mov_b32_e32 v46, s19
; VI-NEXT: v_mov_b32_e32 v45, s18
; VI-NEXT: v_mov_b32_e32 v44, s17
; VI-NEXT: v_mov_b32_e32 v43, s16
; VI-NEXT: v_mov_b32_e32 v42, s15
; VI-NEXT: v_mov_b32_e32 v41, s14
; VI-NEXT: v_mov_b32_e32 v40, s13
; VI-NEXT: v_mov_b32_e32 v39, s12
; VI-NEXT: v_mov_b32_e32 v38, s11
; VI-NEXT: v_mov_b32_e32 v37, s10
; VI-NEXT: v_mov_b32_e32 v36, s9
; VI-NEXT: v_mov_b32_e32 v35, s8
; VI-NEXT: v_mov_b32_e32 v34, s7
; VI-NEXT: v_mov_b32_e32 v33, s6
; VI-NEXT: v_mov_b32_e32 v32, s5
; VI-NEXT: v_mov_b32_e32 v31, s4
; VI-NEXT: v_mov_b32_e32 v62, v46
; VI-NEXT: v_mov_b32_e32 v61, v45
; VI-NEXT: v_mov_b32_e32 v60, v44
; VI-NEXT: v_mov_b32_e32 v59, v43
; VI-NEXT: v_mov_b32_e32 v58, v42
; VI-NEXT: v_mov_b32_e32 v57, v41
; VI-NEXT: v_mov_b32_e32 v56, v40
; VI-NEXT: v_mov_b32_e32 v55, v39
; VI-NEXT: v_mov_b32_e32 v54, v38
; VI-NEXT: v_mov_b32_e32 v53, v37
; VI-NEXT: v_mov_b32_e32 v52, v36
; VI-NEXT: v_mov_b32_e32 v51, v35
; VI-NEXT: v_mov_b32_e32 v50, v34
; VI-NEXT: v_mov_b32_e32 v49, v33
; VI-NEXT: v_mov_b32_e32 v48, v32
; VI-NEXT: v_mov_b32_e32 v47, v31
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:392
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:388
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:380
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:376
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:372
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:368
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:364
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:360
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:356
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:348
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:344
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:340
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:336
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:332
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:280
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:252
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:240
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:232
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:216
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:212
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:208
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:204
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:200
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:196
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:188
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:168
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:160
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:152
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:144
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:136
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:128
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:120
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:116
; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:112
; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:108
; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:104
; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:100
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:96
; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:92
; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:88
; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84
; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:80
; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:76
; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:72
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:68
; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:64
; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60
; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:56
; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:52
; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:48
; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:40
; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:36
; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:32
; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28
; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:24
; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB139_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v28
; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v12, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v11, v11, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(8)
; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v31
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32
; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v33
; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; VI-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v32, v33, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v33
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; VI-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v33, v33, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v35
; VI-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v34, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v35
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v36
; VI-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v35, v35, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v36
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v37
; VI-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v37
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v38
; VI-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v37, v37, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v38, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v30
; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v24
; VI-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v39, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v27
; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v23
; VI-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v40, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29
; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v26
; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v41, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v17
; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v25
; VI-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v42, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v20
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v19
; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v43, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7
; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v63
; VI-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v44, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22
; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v45, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v46, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; VI-NEXT: v_or_b32_sdwa v47, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5
; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v48, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v49, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v50, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v51, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v52, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v53, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v54, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v55, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v56, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v57, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v58, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v59, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v60, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v61, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v62, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: .LBB139_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x70
; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x60
; VI-NEXT: flat_store_dwordx4 v[3:4], v[59:62]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x50
; VI-NEXT: flat_store_dwordx4 v[3:4], v[55:58]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[51:54]
; VI-NEXT: flat_store_dwordx4 v[0:1], v[47:50]
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v128i8_to_v64bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:396
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s19, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s7, s4
; GFX9-NEXT: s_mov_b32 s8, s4
; GFX9-NEXT: s_mov_b32 s9, s4
; GFX9-NEXT: s_mov_b32 s10, s4
; GFX9-NEXT: s_mov_b32 s11, s4
; GFX9-NEXT: s_mov_b32 s12, s4
; GFX9-NEXT: s_mov_b32 s13, s4
; GFX9-NEXT: s_mov_b32 s14, s4
; GFX9-NEXT: s_mov_b32 s15, s4
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s4
; GFX9-NEXT: s_mov_b32 s18, s4
; GFX9-NEXT: v_mov_b32_e32 v46, s19
; GFX9-NEXT: v_mov_b32_e32 v45, s18
; GFX9-NEXT: v_mov_b32_e32 v44, s17
; GFX9-NEXT: v_mov_b32_e32 v43, s16
; GFX9-NEXT: v_mov_b32_e32 v42, s15
; GFX9-NEXT: v_mov_b32_e32 v41, s14
; GFX9-NEXT: v_mov_b32_e32 v40, s13
; GFX9-NEXT: v_mov_b32_e32 v39, s12
; GFX9-NEXT: v_mov_b32_e32 v38, s11
; GFX9-NEXT: v_mov_b32_e32 v37, s10
; GFX9-NEXT: v_mov_b32_e32 v36, s9
; GFX9-NEXT: v_mov_b32_e32 v35, s8
; GFX9-NEXT: v_mov_b32_e32 v34, s7
; GFX9-NEXT: v_mov_b32_e32 v33, s6
; GFX9-NEXT: v_mov_b32_e32 v32, s5
; GFX9-NEXT: v_mov_b32_e32 v31, s4
; GFX9-NEXT: v_mov_b32_e32 v62, v46
; GFX9-NEXT: v_mov_b32_e32 v61, v45
; GFX9-NEXT: v_mov_b32_e32 v60, v44
; GFX9-NEXT: v_mov_b32_e32 v59, v43
; GFX9-NEXT: v_mov_b32_e32 v58, v42
; GFX9-NEXT: v_mov_b32_e32 v57, v41
; GFX9-NEXT: v_mov_b32_e32 v56, v40
; GFX9-NEXT: v_mov_b32_e32 v55, v39
; GFX9-NEXT: v_mov_b32_e32 v54, v38
; GFX9-NEXT: v_mov_b32_e32 v53, v37
; GFX9-NEXT: v_mov_b32_e32 v52, v36
; GFX9-NEXT: v_mov_b32_e32 v51, v35
; GFX9-NEXT: v_mov_b32_e32 v50, v34
; GFX9-NEXT: v_mov_b32_e32 v49, v33
; GFX9-NEXT: v_mov_b32_e32 v48, v32
; GFX9-NEXT: v_mov_b32_e32 v47, v31
; GFX9-NEXT: s_waitcnt vmcnt(44)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:392
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:388
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:380
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:376
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:372
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:368
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:364
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:360
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:356
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:348
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:344
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:340
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:336
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:332
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:280
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:252
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:240
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:232
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:216
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:212
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:208
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:204
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:200
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:196
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:188
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:168
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:160
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:152
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:144
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:136
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:128
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:120
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:116
; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:112
; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:108
; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:104
; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:100
; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:96
; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:92
; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:88
; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84
; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:80
; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:76
; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:72
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:68
; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:64
; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60
; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:56
; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:52
; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:48
; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:40
; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:36
; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:32
; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28
; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:24
; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:16
; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB139_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b32 s6, 0x5040100
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v28
; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v12, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v11, v11, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32
; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v32, v34, v33, s6
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v33, v34, v33, s6
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35
; GFX9-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v34, v35, v34, s6
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36
; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v35, v36, v35, s6
; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37
; GFX9-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v36, v37, v36, s6
; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v38
; GFX9-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v37, v38, v37, s6
; GFX9-NEXT: v_perm_b32 v38, v11, v12, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v30
; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v24
; GFX9-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v39, v12, v11, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v27
; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v23
; GFX9-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v40, v12, v11, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29
; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v26
; GFX9-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v41, v4, v11, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v17
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v25
; GFX9-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v42, v4, v3, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v20
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19
; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v43, v4, v3, s6
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v63
; GFX9-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v44, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22
; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v45, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v46, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; GFX9-NEXT: v_perm_b32 v47, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5
; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v48, v5, v4, s6
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v49, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v50, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v51, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v52, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v53, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v54, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v55, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v56, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v57, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v58, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v59, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v60, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v61, v3, v0, s6
; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v62, v3, v0, s6
; GFX9-NEXT: .LBB139_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[59:62], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[55:58], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[51:54], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:64
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v128i8_to_v64bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:600
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:596
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:592
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:588
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:584
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:580
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:576
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:572
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:568
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:564
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:560
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:556
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:552
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:548
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:544
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:540
; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:536
; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:532
; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:528
; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:524
; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:520
; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:516
; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:512
; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:508
; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:504
; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:500
; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:496
; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:492
; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:488
; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:484
; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:480
; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:476
; GFX11-NEXT: s_clause 0x12
; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:472
; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:468
; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:464
; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:460
; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:456
; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:452
; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:448
; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:444
; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:440
; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:436
; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:432
; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:428
; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:424
; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:420
; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:416
; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:412
; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:408
; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:404
; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:400
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:396
; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:392
; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:388
; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:384
; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:380
; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:376
; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:372
; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:368
; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:364
; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:360
; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:356
; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:352
; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:348
; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:344
; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:340
; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:336
; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:332
; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:328
; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:324
; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:320
; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:316
; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:312
; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:308
; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:304
; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:300
; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:296
; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:292
; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:288
; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:284
; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:280
; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:276
; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:272
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:268
; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:264
; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:260
; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:256
; GFX11-NEXT: scratch_load_u16 v132, off, s32 offset:252
; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:248
; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:244
; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:240
; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:236
; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:232
; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:228
; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:224
; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:220
; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:216
; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:212
; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:208
; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:204
; GFX11-NEXT: scratch_load_u16 v161, off, s32 offset:200
; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:196
; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:192
; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:188
; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:184
; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:180
; GFX11-NEXT: scratch_load_u16 v167, off, s32 offset:176
; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:172
; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:168
; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:164
; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:160
; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:156
; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:152
; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:148
; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:144
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_load_u16 v63, off, s32 offset:140
; GFX11-NEXT: scratch_load_u16 v72, off, s32 offset:136
; GFX11-NEXT: scratch_load_u16 v73, off, s32 offset:132
; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:128
; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:124
; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:120
; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:116
; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:112
; GFX11-NEXT: scratch_load_u16 v79, off, s32 offset:108
; GFX11-NEXT: scratch_load_u16 v88, off, s32 offset:104
; GFX11-NEXT: scratch_load_u16 v89, off, s32 offset:100
; GFX11-NEXT: scratch_load_u16 v90, off, s32 offset:96
; GFX11-NEXT: scratch_load_u16 v91, off, s32 offset:92
; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:88
; GFX11-NEXT: scratch_load_u16 v93, off, s32 offset:84
; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:80
; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:76
; GFX11-NEXT: scratch_load_u16 v104, off, s32 offset:72
; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:68
; GFX11-NEXT: scratch_load_u16 v106, off, s32 offset:64
; GFX11-NEXT: scratch_load_u16 v107, off, s32 offset:60
; GFX11-NEXT: scratch_load_u16 v108, off, s32 offset:56
; GFX11-NEXT: scratch_load_u16 v109, off, s32 offset:52
; GFX11-NEXT: scratch_load_u16 v110, off, s32 offset:48
; GFX11-NEXT: scratch_load_u16 v111, off, s32 offset:44
; GFX11-NEXT: scratch_load_u16 v120, off, s32 offset:40
; GFX11-NEXT: scratch_load_u16 v121, off, s32 offset:36
; GFX11-NEXT: scratch_load_u16 v122, off, s32 offset:32
; GFX11-NEXT: scratch_load_u16 v123, off, s32 offset:28
; GFX11-NEXT: scratch_load_u16 v124, off, s32 offset:24
; GFX11-NEXT: scratch_load_u16 v125, off, s32 offset:20
; GFX11-NEXT: scratch_load_u16 v126, off, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_u16 v127, off, s32 offset:12
; GFX11-NEXT: scratch_load_u16 v136, off, s32 offset:8
; GFX11-NEXT: scratch_load_u16 v137, off, s32 offset:4
; GFX11-NEXT: scratch_load_u16 v138, off, s32
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s15, s0
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s0
; GFX11-NEXT: s_mov_b32 s6, s0
; GFX11-NEXT: s_mov_b32 s7, s0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s0
; GFX11-NEXT: s_mov_b32 s10, s0
; GFX11-NEXT: s_mov_b32 s11, s0
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: s_mov_b32 s13, s0
; GFX11-NEXT: s_mov_b32 s14, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v46, s15 :: v_dual_mov_b32 v45, s14
; GFX11-NEXT: v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s12
; GFX11-NEXT: v_dual_mov_b32 v42, s11 :: v_dual_mov_b32 v41, s10
; GFX11-NEXT: v_dual_mov_b32 v40, s9 :: v_dual_mov_b32 v39, s8
; GFX11-NEXT: v_dual_mov_b32 v38, s7 :: v_dual_mov_b32 v37, s6
; GFX11-NEXT: v_dual_mov_b32 v36, s5 :: v_dual_mov_b32 v35, s4
; GFX11-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v33, s2
; GFX11-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v31, s0
; GFX11-NEXT: v_dual_mov_b32 v62, v46 :: v_dual_mov_b32 v61, v45
; GFX11-NEXT: v_dual_mov_b32 v60, v44 :: v_dual_mov_b32 v59, v43
; GFX11-NEXT: v_dual_mov_b32 v58, v42 :: v_dual_mov_b32 v57, v41
; GFX11-NEXT: v_dual_mov_b32 v56, v40 :: v_dual_mov_b32 v55, v39
; GFX11-NEXT: v_dual_mov_b32 v54, v38 :: v_dual_mov_b32 v53, v37
; GFX11-NEXT: v_dual_mov_b32 v52, v36 :: v_dual_mov_b32 v51, v35
; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB139_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v8
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v10
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v15
; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v16
; GFX11-NEXT: v_perm_b32 v31, v3, v0, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v11
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v12
; GFX11-NEXT: v_perm_b32 v32, v5, v4, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v13
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v14
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v17
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v18
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v19
; GFX11-NEXT: v_lshlrev_b16 v11, 8, v20
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v21
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v22
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v23
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v24
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v25
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v26
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v27
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v28
; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v29
; GFX11-NEXT: v_lshlrev_b16 v16, 8, v30
; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
; GFX11-NEXT: v_perm_b32 v33, v3, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v34, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v35, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v36, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v37, v11, v10, 0x5040100
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v138
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v137
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v136
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v127
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v126
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v125
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v124
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v123
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v122
; GFX11-NEXT: v_lshlrev_b16 v11, 8, v121
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v120
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v111
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v110
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v109
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v108
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v107
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v106
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v105
; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v104
; GFX11-NEXT: v_lshlrev_b16 v16, 8, v95
; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
; GFX11-NEXT: v_perm_b32 v38, v3, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v39, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v40, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v41, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v42, v11, v10, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v94
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v93
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v92
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v91
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v90
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v89
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v88
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v79
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v78
; GFX11-NEXT: v_lshlrev_b16 v11, 8, v77
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v76
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v75
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v74
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v73
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v72
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v63
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v183
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v182
; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v181
; GFX11-NEXT: v_lshlrev_b16 v16, 8, v180
; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
; GFX11-NEXT: v_perm_b32 v43, v3, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v44, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v45, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v46, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v47, v11, v10, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v179
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v178
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v177
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v176
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v167
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v166
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v165
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v164
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v163
; GFX11-NEXT: v_lshlrev_b16 v11, 8, v162
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v161
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v160
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v151
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v150
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v149
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v148
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v147
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v146
; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v145
; GFX11-NEXT: v_lshlrev_b16 v16, 8, v144
; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
; GFX11-NEXT: v_perm_b32 v48, v3, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v49, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v50, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v51, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v52, v11, v10, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v135
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v134
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v133
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v132
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v131
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v130
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v129
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v128
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v119
; GFX11-NEXT: v_lshlrev_b16 v11, 8, v118
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v117
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v116
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v115
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v114
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v113
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v112
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v103
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v102
; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v101
; GFX11-NEXT: v_lshlrev_b16 v16, 8, v100
; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
; GFX11-NEXT: v_perm_b32 v53, v3, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v54, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v55, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v56, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v57, v11, v10, 0x5040100
; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v99
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v98
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v97
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v96
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v87
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v86
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v85
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v84
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v83
; GFX11-NEXT: v_lshlrev_b16 v11, 8, v82
; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v81
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v80
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v71
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v70
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v69
; GFX11-NEXT: v_lshlrev_b16 v12, 8, v68
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v67
; GFX11-NEXT: v_lshlrev_b16 v14, 8, v66
; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v65
; GFX11-NEXT: v_lshlrev_b16 v16, 8, v64
; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
; GFX11-NEXT: v_perm_b32 v58, v3, v0, 0x5040100
; GFX11-NEXT: v_perm_b32 v59, v5, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v60, v7, v6, 0x5040100
; GFX11-NEXT: v_perm_b32 v61, v9, v8, 0x5040100
; GFX11-NEXT: v_perm_b32 v62, v11, v10, 0x5040100
; GFX11-NEXT: .LBB139_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off
; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:112
; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:96
; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:80
; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:64
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:400
; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:404
; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:408
; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:412
; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:416
; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:420
; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:424
; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:428
; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:432
; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:436
; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:440
; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:444
; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:448
; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:452
; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:456
; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:460
; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:464
; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:468
; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:472
; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:476
; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:480
; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:484
; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:488
; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:492
; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:496
; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:500
; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:504
; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:508
; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:512
; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:516
; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:520
; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:524
; GFX11-NEXT: s_clause 0x12
; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:528
; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:532
; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:536
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:540
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:544
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:548
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:552
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:556
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:560
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:564
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:568
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:572
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:576
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:580
; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:584
; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:588
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:592
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:596
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:600
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <128 x i8> %value to <64 x bfloat>
br label %end
end:
%phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
store <64 x bfloat> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v64bf16_to_v64i16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v32, v31
; GCN-NEXT: v_mov_b32_e32 v33, v31
; GCN-NEXT: v_mov_b32_e32 v34, v31
; GCN-NEXT: v_mov_b32_e32 v35, v31
; GCN-NEXT: v_mov_b32_e32 v36, v31
; GCN-NEXT: v_mov_b32_e32 v37, v31
; GCN-NEXT: v_mov_b32_e32 v38, v31
; GCN-NEXT: v_mov_b32_e32 v48, v31
; GCN-NEXT: v_mov_b32_e32 v49, v31
; GCN-NEXT: v_mov_b32_e32 v50, v31
; GCN-NEXT: v_mov_b32_e32 v51, v31
; GCN-NEXT: v_mov_b32_e32 v52, v31
; GCN-NEXT: v_mov_b32_e32 v53, v31
; GCN-NEXT: v_mov_b32_e32 v54, v31
; GCN-NEXT: v_mov_b32_e32 v55, v31
; GCN-NEXT: v_mov_b32_e32 v39, v31
; GCN-NEXT: v_mov_b32_e32 v40, v31
; GCN-NEXT: v_mov_b32_e32 v41, v31
; GCN-NEXT: v_mov_b32_e32 v42, v31
; GCN-NEXT: v_mov_b32_e32 v43, v31
; GCN-NEXT: v_mov_b32_e32 v44, v31
; GCN-NEXT: v_mov_b32_e32 v45, v31
; GCN-NEXT: v_mov_b32_e32 v46, v31
; GCN-NEXT: v_mov_b32_e32 v56, v31
; GCN-NEXT: v_mov_b32_e32 v57, v31
; GCN-NEXT: v_mov_b32_e32 v58, v31
; GCN-NEXT: v_mov_b32_e32 v59, v31
; GCN-NEXT: v_mov_b32_e32 v60, v31
; GCN-NEXT: v_mov_b32_e32 v61, v31
; GCN-NEXT: v_mov_b32_e32 v62, v31
; GCN-NEXT: v_mov_b32_e32 v63, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB140_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47
; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51
; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52
; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60
; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45
; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40
; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53
; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55
; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41
; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42
; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43
; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44
; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46
; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56
; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57
; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16
; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16
; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16
; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16
; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16
; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16
; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16
; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16
; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16
; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16
; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16
; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16
; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16
; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16
; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16
; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16
; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16
; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16
; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16
; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16
; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16
; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16
; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16
; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
; GCN-NEXT: .LBB140_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(6)
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(5)
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(4)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v64bf16_to_v64i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_mov_b32 s19, s4
; VI-NEXT: s_mov_b32 s5, s4
; VI-NEXT: s_mov_b32 s6, s4
; VI-NEXT: s_mov_b32 s7, s4
; VI-NEXT: s_mov_b32 s8, s4
; VI-NEXT: s_mov_b32 s9, s4
; VI-NEXT: s_mov_b32 s10, s4
; VI-NEXT: s_mov_b32 s11, s4
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s4
; VI-NEXT: s_mov_b32 s14, s4
; VI-NEXT: s_mov_b32 s15, s4
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s4
; VI-NEXT: s_mov_b32 s18, s4
; VI-NEXT: v_mov_b32_e32 v50, s19
; VI-NEXT: v_mov_b32_e32 v49, s18
; VI-NEXT: v_mov_b32_e32 v48, s17
; VI-NEXT: v_mov_b32_e32 v47, s16
; VI-NEXT: v_mov_b32_e32 v46, s15
; VI-NEXT: v_mov_b32_e32 v45, s14
; VI-NEXT: v_mov_b32_e32 v44, s13
; VI-NEXT: v_mov_b32_e32 v43, s12
; VI-NEXT: v_mov_b32_e32 v42, s11
; VI-NEXT: v_mov_b32_e32 v41, s10
; VI-NEXT: v_mov_b32_e32 v40, s9
; VI-NEXT: v_mov_b32_e32 v39, s8
; VI-NEXT: v_mov_b32_e32 v38, s7
; VI-NEXT: v_mov_b32_e32 v37, s6
; VI-NEXT: v_mov_b32_e32 v36, s5
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB140_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
; VI-NEXT: v_mov_b32_e32 v48, v16
; VI-NEXT: v_mov_b32_e32 v47, v15
; VI-NEXT: v_mov_b32_e32 v46, v14
; VI-NEXT: v_mov_b32_e32 v45, v13
; VI-NEXT: v_mov_b32_e32 v44, v12
; VI-NEXT: v_mov_b32_e32 v43, v11
; VI-NEXT: v_mov_b32_e32 v42, v10
; VI-NEXT: v_mov_b32_e32 v41, v9
; VI-NEXT: v_mov_b32_e32 v40, v8
; VI-NEXT: v_mov_b32_e32 v39, v7
; VI-NEXT: v_mov_b32_e32 v38, v6
; VI-NEXT: v_mov_b32_e32 v37, v5
; VI-NEXT: v_mov_b32_e32 v36, v4
; VI-NEXT: v_mov_b32_e32 v35, v3
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: .LBB140_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: s_movk_i32 s4, 0x70
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x60
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x50
; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v64bf16_to_v64i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s19, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s7, s4
; GFX9-NEXT: s_mov_b32 s8, s4
; GFX9-NEXT: s_mov_b32 s9, s4
; GFX9-NEXT: s_mov_b32 s10, s4
; GFX9-NEXT: s_mov_b32 s11, s4
; GFX9-NEXT: s_mov_b32 s12, s4
; GFX9-NEXT: s_mov_b32 s13, s4
; GFX9-NEXT: s_mov_b32 s14, s4
; GFX9-NEXT: s_mov_b32 s15, s4
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s4
; GFX9-NEXT: s_mov_b32 s18, s4
; GFX9-NEXT: v_mov_b32_e32 v50, s19
; GFX9-NEXT: v_mov_b32_e32 v49, s18
; GFX9-NEXT: v_mov_b32_e32 v48, s17
; GFX9-NEXT: v_mov_b32_e32 v47, s16
; GFX9-NEXT: v_mov_b32_e32 v46, s15
; GFX9-NEXT: v_mov_b32_e32 v45, s14
; GFX9-NEXT: v_mov_b32_e32 v44, s13
; GFX9-NEXT: v_mov_b32_e32 v43, s12
; GFX9-NEXT: v_mov_b32_e32 v42, s11
; GFX9-NEXT: v_mov_b32_e32 v41, s10
; GFX9-NEXT: v_mov_b32_e32 v40, s9
; GFX9-NEXT: v_mov_b32_e32 v39, s8
; GFX9-NEXT: v_mov_b32_e32 v38, s7
; GFX9-NEXT: v_mov_b32_e32 v37, s6
; GFX9-NEXT: v_mov_b32_e32 v36, s5
; GFX9-NEXT: v_mov_b32_e32 v35, s4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB140_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
; GFX9-NEXT: v_mov_b32_e32 v48, v16
; GFX9-NEXT: v_mov_b32_e32 v47, v15
; GFX9-NEXT: v_mov_b32_e32 v46, v14
; GFX9-NEXT: v_mov_b32_e32 v45, v13
; GFX9-NEXT: v_mov_b32_e32 v44, v12
; GFX9-NEXT: v_mov_b32_e32 v43, v11
; GFX9-NEXT: v_mov_b32_e32 v42, v10
; GFX9-NEXT: v_mov_b32_e32 v41, v9
; GFX9-NEXT: v_mov_b32_e32 v40, v8
; GFX9-NEXT: v_mov_b32_e32 v39, v7
; GFX9-NEXT: v_mov_b32_e32 v38, v6
; GFX9-NEXT: v_mov_b32_e32 v37, v5
; GFX9-NEXT: v_mov_b32_e32 v36, v4
; GFX9-NEXT: v_mov_b32_e32 v35, v3
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: .LBB140_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v64bf16_to_v64i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s15, s0
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s0
; GFX11-NEXT: s_mov_b32 s6, s0
; GFX11-NEXT: s_mov_b32 s7, s0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s0
; GFX11-NEXT: s_mov_b32 s10, s0
; GFX11-NEXT: s_mov_b32 s11, s0
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: s_mov_b32 s13, s0
; GFX11-NEXT: s_mov_b32 s14, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB140_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
; GFX11-NEXT: .LBB140_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <64 x bfloat> %value to <64 x i16>
br label %end
end:
%phi = phi <64 x i16> [zeroinitializer, %entry], [%cast, %if]
store <64 x i16> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v64bf16_to_v64f16:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:36
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: v_mov_b32_e32 v30, 0
; GCN-NEXT: v_mov_b32_e32 v58, 0
; GCN-NEXT: v_mov_b32_e32 v56, 0
; GCN-NEXT: v_mov_b32_e32 v57, 0
; GCN-NEXT: v_mov_b32_e32 v45, 0
; GCN-NEXT: v_mov_b32_e32 v47, 0
; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: v_mov_b32_e32 v46, 0
; GCN-NEXT: v_mov_b32_e32 v41, 0
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: v_mov_b32_e32 v54, 0
; GCN-NEXT: v_mov_b32_e32 v55, 0
; GCN-NEXT: v_mov_b32_e32 v52, 0
; GCN-NEXT: v_mov_b32_e32 v53, 0
; GCN-NEXT: v_mov_b32_e32 v49, 0
; GCN-NEXT: v_mov_b32_e32 v51, 0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: v_mov_b32_e32 v50, 0
; GCN-NEXT: v_mov_b32_e32 v37, 0
; GCN-NEXT: v_mov_b32_e32 v39, 0
; GCN-NEXT: v_mov_b32_e32 v36, 0
; GCN-NEXT: v_mov_b32_e32 v38, 0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v22, 0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB141_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v61
; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v23
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v33
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v28
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v32
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v24
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v31
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v29
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v63
; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v60
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v62
; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v59
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v35
; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v61
; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34
; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v59
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v60
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v61
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v62
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v63
; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v52
; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v53
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v54
; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v49
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v55
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v40
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v41
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v43
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v44
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v45
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v46
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v47
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v56
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v57
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v58
; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48
; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39
; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38
; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37
; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36
; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3
; GCN-NEXT: v_cvt_f32_f16_e32 v58, v4
; GCN-NEXT: v_cvt_f32_f16_e32 v56, v5
; GCN-NEXT: v_cvt_f32_f16_e32 v57, v6
; GCN-NEXT: v_cvt_f32_f16_e32 v45, v7
; GCN-NEXT: v_cvt_f32_f16_e32 v47, v8
; GCN-NEXT: v_cvt_f32_f16_e32 v44, v9
; GCN-NEXT: v_cvt_f32_f16_e32 v46, v10
; GCN-NEXT: v_cvt_f32_f16_e32 v41, v11
; GCN-NEXT: v_cvt_f32_f16_e32 v43, v12
; GCN-NEXT: v_cvt_f32_f16_e32 v40, v13
; GCN-NEXT: v_cvt_f32_f16_e32 v42, v14
; GCN-NEXT: v_cvt_f32_f16_e32 v54, v15
; GCN-NEXT: v_cvt_f32_f16_e32 v55, v16
; GCN-NEXT: v_cvt_f32_f16_e32 v52, v17
; GCN-NEXT: v_cvt_f32_f16_e32 v53, v18
; GCN-NEXT: v_cvt_f32_f16_e32 v49, v19
; GCN-NEXT: v_cvt_f32_f16_e32 v51, v20
; GCN-NEXT: v_cvt_f32_f16_e32 v48, v21
; GCN-NEXT: v_cvt_f32_f16_e32 v50, v22
; GCN-NEXT: v_cvt_f32_f16_e32 v37, v23
; GCN-NEXT: v_cvt_f32_f16_e32 v39, v24
; GCN-NEXT: v_cvt_f32_f16_e32 v36, v25
; GCN-NEXT: v_cvt_f32_f16_e32 v38, v26
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v28
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v31
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v35
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v59
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v63
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v22, v3
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: .LBB141_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v58
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
; GCN-NEXT: v_or_b32_e32 v3, v3, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v57
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v56
; GCN-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v47
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45
; GCN-NEXT: v_or_b32_e32 v5, v5, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v44
; GCN-NEXT: v_or_b32_e32 v6, v6, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v41
; GCN-NEXT: v_or_b32_e32 v3, v3, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v42
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v40
; GCN-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v54
; GCN-NEXT: v_or_b32_e32 v5, v5, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v52
; GCN-NEXT: v_or_b32_e32 v6, v6, v0
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v49
; GCN-NEXT: v_or_b32_e32 v3, v3, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v48
; GCN-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v37
; GCN-NEXT: v_or_b32_e32 v5, v5, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v36
; GCN-NEXT: v_or_b32_e32 v6, v6, v0
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_or_b32_e32 v3, v3, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
; GCN-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
; GCN-NEXT: v_or_b32_e32 v5, v5, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
; GCN-NEXT: v_or_b32_e32 v6, v6, v0
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_or_b32_e32 v3, v3, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
; GCN-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
; GCN-NEXT: v_or_b32_e32 v5, v5, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
; GCN-NEXT: v_or_b32_e32 v6, v6, v0
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_or_b32_e32 v3, v3, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
; GCN-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
; GCN-NEXT: v_or_b32_e32 v5, v5, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
; GCN-NEXT: v_or_b32_e32 v6, v6, v0
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_or_b32_e32 v3, v3, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
; GCN-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
; GCN-NEXT: v_or_b32_e32 v5, v5, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
; GCN-NEXT: v_or_b32_e32 v6, v6, v0
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
; GCN-NEXT: v_or_b32_e32 v3, v3, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
; GCN-NEXT: v_or_b32_e32 v4, v4, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v22
; GCN-NEXT: v_or_b32_e32 v5, v5, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
; GCN-NEXT: v_or_b32_e32 v6, v6, v0
; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v64bf16_to_v64f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: s_mov_b32 s4, 0
; VI-NEXT: s_mov_b32 s19, s4
; VI-NEXT: s_mov_b32 s5, s4
; VI-NEXT: s_mov_b32 s6, s4
; VI-NEXT: s_mov_b32 s7, s4
; VI-NEXT: s_mov_b32 s8, s4
; VI-NEXT: s_mov_b32 s9, s4
; VI-NEXT: s_mov_b32 s10, s4
; VI-NEXT: s_mov_b32 s11, s4
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s4
; VI-NEXT: s_mov_b32 s14, s4
; VI-NEXT: s_mov_b32 s15, s4
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s4
; VI-NEXT: s_mov_b32 s18, s4
; VI-NEXT: v_mov_b32_e32 v50, s19
; VI-NEXT: v_mov_b32_e32 v49, s18
; VI-NEXT: v_mov_b32_e32 v48, s17
; VI-NEXT: v_mov_b32_e32 v47, s16
; VI-NEXT: v_mov_b32_e32 v46, s15
; VI-NEXT: v_mov_b32_e32 v45, s14
; VI-NEXT: v_mov_b32_e32 v44, s13
; VI-NEXT: v_mov_b32_e32 v43, s12
; VI-NEXT: v_mov_b32_e32 v42, s11
; VI-NEXT: v_mov_b32_e32 v41, s10
; VI-NEXT: v_mov_b32_e32 v40, s9
; VI-NEXT: v_mov_b32_e32 v39, s8
; VI-NEXT: v_mov_b32_e32 v38, s7
; VI-NEXT: v_mov_b32_e32 v37, s6
; VI-NEXT: v_mov_b32_e32 v36, s5
; VI-NEXT: v_mov_b32_e32 v35, s4
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB141_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: v_mov_b32_e32 v50, v18
; VI-NEXT: v_mov_b32_e32 v49, v17
; VI-NEXT: v_mov_b32_e32 v48, v16
; VI-NEXT: v_mov_b32_e32 v47, v15
; VI-NEXT: v_mov_b32_e32 v46, v14
; VI-NEXT: v_mov_b32_e32 v45, v13
; VI-NEXT: v_mov_b32_e32 v44, v12
; VI-NEXT: v_mov_b32_e32 v43, v11
; VI-NEXT: v_mov_b32_e32 v42, v10
; VI-NEXT: v_mov_b32_e32 v41, v9
; VI-NEXT: v_mov_b32_e32 v40, v8
; VI-NEXT: v_mov_b32_e32 v39, v7
; VI-NEXT: v_mov_b32_e32 v38, v6
; VI-NEXT: v_mov_b32_e32 v37, v5
; VI-NEXT: v_mov_b32_e32 v36, v4
; VI-NEXT: v_mov_b32_e32 v35, v3
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: .LBB141_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: s_movk_i32 s4, 0x70
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x60
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: s_movk_i32 s4, 0x50
; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v64bf16_to_v64f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s19, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s7, s4
; GFX9-NEXT: s_mov_b32 s8, s4
; GFX9-NEXT: s_mov_b32 s9, s4
; GFX9-NEXT: s_mov_b32 s10, s4
; GFX9-NEXT: s_mov_b32 s11, s4
; GFX9-NEXT: s_mov_b32 s12, s4
; GFX9-NEXT: s_mov_b32 s13, s4
; GFX9-NEXT: s_mov_b32 s14, s4
; GFX9-NEXT: s_mov_b32 s15, s4
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s4
; GFX9-NEXT: s_mov_b32 s18, s4
; GFX9-NEXT: v_mov_b32_e32 v50, s19
; GFX9-NEXT: v_mov_b32_e32 v49, s18
; GFX9-NEXT: v_mov_b32_e32 v48, s17
; GFX9-NEXT: v_mov_b32_e32 v47, s16
; GFX9-NEXT: v_mov_b32_e32 v46, s15
; GFX9-NEXT: v_mov_b32_e32 v45, s14
; GFX9-NEXT: v_mov_b32_e32 v44, s13
; GFX9-NEXT: v_mov_b32_e32 v43, s12
; GFX9-NEXT: v_mov_b32_e32 v42, s11
; GFX9-NEXT: v_mov_b32_e32 v41, s10
; GFX9-NEXT: v_mov_b32_e32 v40, s9
; GFX9-NEXT: v_mov_b32_e32 v39, s8
; GFX9-NEXT: v_mov_b32_e32 v38, s7
; GFX9-NEXT: v_mov_b32_e32 v37, s6
; GFX9-NEXT: v_mov_b32_e32 v36, s5
; GFX9-NEXT: v_mov_b32_e32 v35, s4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB141_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: v_mov_b32_e32 v50, v18
; GFX9-NEXT: v_mov_b32_e32 v49, v17
; GFX9-NEXT: v_mov_b32_e32 v48, v16
; GFX9-NEXT: v_mov_b32_e32 v47, v15
; GFX9-NEXT: v_mov_b32_e32 v46, v14
; GFX9-NEXT: v_mov_b32_e32 v45, v13
; GFX9-NEXT: v_mov_b32_e32 v44, v12
; GFX9-NEXT: v_mov_b32_e32 v43, v11
; GFX9-NEXT: v_mov_b32_e32 v42, v10
; GFX9-NEXT: v_mov_b32_e32 v41, v9
; GFX9-NEXT: v_mov_b32_e32 v40, v8
; GFX9-NEXT: v_mov_b32_e32 v39, v7
; GFX9-NEXT: v_mov_b32_e32 v38, v6
; GFX9-NEXT: v_mov_b32_e32 v37, v5
; GFX9-NEXT: v_mov_b32_e32 v36, v4
; GFX9-NEXT: v_mov_b32_e32 v35, v3
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: .LBB141_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v64bf16_to_v64f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s15, s0
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s0
; GFX11-NEXT: s_mov_b32 s6, s0
; GFX11-NEXT: s_mov_b32 s7, s0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s0
; GFX11-NEXT: s_mov_b32 s10, s0
; GFX11-NEXT: s_mov_b32 s11, s0
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: s_mov_b32 s13, s0
; GFX11-NEXT: s_mov_b32 s14, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB141_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
; GFX11-NEXT: .LBB141_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <64 x bfloat> %value to <64 x half>
br label %end
end:
%phi = phi <64 x half> [zeroinitializer, %entry], [%cast, %if]
store <64 x half> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v64bf16_to_v128i8:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v32, v31
; GCN-NEXT: v_mov_b32_e32 v33, v31
; GCN-NEXT: v_mov_b32_e32 v34, v31
; GCN-NEXT: v_mov_b32_e32 v35, v31
; GCN-NEXT: v_mov_b32_e32 v36, v31
; GCN-NEXT: v_mov_b32_e32 v37, v31
; GCN-NEXT: v_mov_b32_e32 v38, v31
; GCN-NEXT: v_mov_b32_e32 v48, v31
; GCN-NEXT: v_mov_b32_e32 v49, v31
; GCN-NEXT: v_mov_b32_e32 v50, v31
; GCN-NEXT: v_mov_b32_e32 v51, v31
; GCN-NEXT: v_mov_b32_e32 v52, v31
; GCN-NEXT: v_mov_b32_e32 v53, v31
; GCN-NEXT: v_mov_b32_e32 v54, v31
; GCN-NEXT: v_mov_b32_e32 v55, v31
; GCN-NEXT: v_mov_b32_e32 v39, v31
; GCN-NEXT: v_mov_b32_e32 v40, v31
; GCN-NEXT: v_mov_b32_e32 v41, v31
; GCN-NEXT: v_mov_b32_e32 v42, v31
; GCN-NEXT: v_mov_b32_e32 v43, v31
; GCN-NEXT: v_mov_b32_e32 v44, v31
; GCN-NEXT: v_mov_b32_e32 v45, v31
; GCN-NEXT: v_mov_b32_e32 v46, v31
; GCN-NEXT: v_mov_b32_e32 v56, v31
; GCN-NEXT: v_mov_b32_e32 v57, v31
; GCN-NEXT: v_mov_b32_e32 v58, v31
; GCN-NEXT: v_mov_b32_e32 v59, v31
; GCN-NEXT: v_mov_b32_e32 v60, v31
; GCN-NEXT: v_mov_b32_e32 v61, v31
; GCN-NEXT: v_mov_b32_e32 v62, v31
; GCN-NEXT: v_mov_b32_e32 v63, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB142_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47
; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51
; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52
; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60
; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45
; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40
; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53
; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55
; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41
; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42
; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43
; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44
; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46
; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56
; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57
; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16
; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16
; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16
; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16
; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16
; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16
; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16
; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16
; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16
; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16
; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16
; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16
; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16
; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16
; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16
; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16
; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16
; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16
; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16
; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16
; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16
; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16
; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16
; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
; GCN-NEXT: .LBB142_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(6)
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(5)
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(4)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v64bf16_to_v128i8:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: v_mov_b32_e32 v6, v3
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: v_mov_b32_e32 v8, v3
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v10, v3
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: v_mov_b32_e32 v12, v3
; VI-NEXT: v_mov_b32_e32 v13, v3
; VI-NEXT: v_mov_b32_e32 v14, v3
; VI-NEXT: v_mov_b32_e32 v15, v3
; VI-NEXT: v_mov_b32_e32 v16, v3
; VI-NEXT: v_mov_b32_e32 v17, v3
; VI-NEXT: v_mov_b32_e32 v18, v3
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: v_mov_b32_e32 v20, v3
; VI-NEXT: v_mov_b32_e32 v21, v3
; VI-NEXT: v_mov_b32_e32 v22, v3
; VI-NEXT: v_mov_b32_e32 v23, v3
; VI-NEXT: v_mov_b32_e32 v24, v3
; VI-NEXT: v_mov_b32_e32 v25, v3
; VI-NEXT: v_mov_b32_e32 v26, v3
; VI-NEXT: v_mov_b32_e32 v27, v3
; VI-NEXT: v_mov_b32_e32 v28, v3
; VI-NEXT: v_mov_b32_e32 v29, v3
; VI-NEXT: v_mov_b32_e32 v30, v3
; VI-NEXT: v_mov_b32_e32 v31, v3
; VI-NEXT: v_mov_b32_e32 v32, v3
; VI-NEXT: v_mov_b32_e32 v33, v3
; VI-NEXT: v_mov_b32_e32 v34, v3
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB142_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; VI-NEXT: .LBB142_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1
; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx4 v[35:36], v[31:34]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v31, vcc, 0x60, v1
; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[31:32], v[27:30]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v27, vcc, 0x50, v1
; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[27:28], v[23:26]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v23, vcc, 64, v1
; VI-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[23:24], v[19:22]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v19, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v11, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[3:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v64bf16_to_v128i8:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: v_mov_b32_e32 v6, v3
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: v_mov_b32_e32 v8, v3
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_mov_b32_e32 v10, v3
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: v_mov_b32_e32 v12, v3
; GFX9-NEXT: v_mov_b32_e32 v13, v3
; GFX9-NEXT: v_mov_b32_e32 v14, v3
; GFX9-NEXT: v_mov_b32_e32 v15, v3
; GFX9-NEXT: v_mov_b32_e32 v16, v3
; GFX9-NEXT: v_mov_b32_e32 v17, v3
; GFX9-NEXT: v_mov_b32_e32 v18, v3
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: v_mov_b32_e32 v20, v3
; GFX9-NEXT: v_mov_b32_e32 v21, v3
; GFX9-NEXT: v_mov_b32_e32 v22, v3
; GFX9-NEXT: v_mov_b32_e32 v23, v3
; GFX9-NEXT: v_mov_b32_e32 v24, v3
; GFX9-NEXT: v_mov_b32_e32 v25, v3
; GFX9-NEXT: v_mov_b32_e32 v26, v3
; GFX9-NEXT: v_mov_b32_e32 v27, v3
; GFX9-NEXT: v_mov_b32_e32 v28, v3
; GFX9-NEXT: v_mov_b32_e32 v29, v3
; GFX9-NEXT: v_mov_b32_e32 v30, v3
; GFX9-NEXT: v_mov_b32_e32 v31, v3
; GFX9-NEXT: v_mov_b32_e32 v32, v3
; GFX9-NEXT: v_mov_b32_e32 v33, v3
; GFX9-NEXT: v_mov_b32_e32 v34, v3
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB142_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: .LBB142_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v64bf16_to_v128i8:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: v_mov_b32_e32 v35, 0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v36, v35
; GFX11-NEXT: v_mov_b32_e32 v37, v35
; GFX11-NEXT: v_mov_b32_e32 v38, v35
; GFX11-NEXT: v_mov_b32_e32 v39, v35
; GFX11-NEXT: v_mov_b32_e32 v40, v35
; GFX11-NEXT: v_mov_b32_e32 v41, v35
; GFX11-NEXT: v_mov_b32_e32 v42, v35
; GFX11-NEXT: v_mov_b32_e32 v43, v35
; GFX11-NEXT: v_mov_b32_e32 v44, v35
; GFX11-NEXT: v_mov_b32_e32 v45, v35
; GFX11-NEXT: v_mov_b32_e32 v46, v35
; GFX11-NEXT: v_mov_b32_e32 v47, v35
; GFX11-NEXT: v_mov_b32_e32 v48, v35
; GFX11-NEXT: v_mov_b32_e32 v49, v35
; GFX11-NEXT: v_mov_b32_e32 v50, v35
; GFX11-NEXT: v_mov_b32_e32 v51, v35
; GFX11-NEXT: v_mov_b32_e32 v52, v35
; GFX11-NEXT: v_mov_b32_e32 v53, v35
; GFX11-NEXT: v_mov_b32_e32 v54, v35
; GFX11-NEXT: v_mov_b32_e32 v55, v35
; GFX11-NEXT: v_mov_b32_e32 v56, v35
; GFX11-NEXT: v_mov_b32_e32 v57, v35
; GFX11-NEXT: v_mov_b32_e32 v58, v35
; GFX11-NEXT: v_mov_b32_e32 v59, v35
; GFX11-NEXT: v_mov_b32_e32 v60, v35
; GFX11-NEXT: v_mov_b32_e32 v61, v35
; GFX11-NEXT: v_mov_b32_e32 v62, v35
; GFX11-NEXT: v_mov_b32_e32 v63, v35
; GFX11-NEXT: v_mov_b32_e32 v64, v35
; GFX11-NEXT: v_mov_b32_e32 v65, v35
; GFX11-NEXT: v_mov_b32_e32 v66, v35
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: s_cbranch_execz .LBB142_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33
; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31
; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29
; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27
; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25
; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23
; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21
; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19
; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17
; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15
; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13
; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11
; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9
; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7
; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5
; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3
; GFX11-NEXT: .LBB142_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112
; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96
; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80
; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64
; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <64 x bfloat> %value to <128 x i8>
br label %end
end:
%phi = phi <128 x i8> [zeroinitializer, %entry], [%cast, %if]
store <128 x i8> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v64bf16_to_v16i64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v32, v31
; GCN-NEXT: v_mov_b32_e32 v33, v31
; GCN-NEXT: v_mov_b32_e32 v34, v31
; GCN-NEXT: v_mov_b32_e32 v35, v31
; GCN-NEXT: v_mov_b32_e32 v36, v31
; GCN-NEXT: v_mov_b32_e32 v37, v31
; GCN-NEXT: v_mov_b32_e32 v38, v31
; GCN-NEXT: v_mov_b32_e32 v48, v31
; GCN-NEXT: v_mov_b32_e32 v49, v31
; GCN-NEXT: v_mov_b32_e32 v50, v31
; GCN-NEXT: v_mov_b32_e32 v51, v31
; GCN-NEXT: v_mov_b32_e32 v52, v31
; GCN-NEXT: v_mov_b32_e32 v53, v31
; GCN-NEXT: v_mov_b32_e32 v54, v31
; GCN-NEXT: v_mov_b32_e32 v55, v31
; GCN-NEXT: v_mov_b32_e32 v39, v31
; GCN-NEXT: v_mov_b32_e32 v40, v31
; GCN-NEXT: v_mov_b32_e32 v41, v31
; GCN-NEXT: v_mov_b32_e32 v42, v31
; GCN-NEXT: v_mov_b32_e32 v43, v31
; GCN-NEXT: v_mov_b32_e32 v44, v31
; GCN-NEXT: v_mov_b32_e32 v45, v31
; GCN-NEXT: v_mov_b32_e32 v46, v31
; GCN-NEXT: v_mov_b32_e32 v56, v31
; GCN-NEXT: v_mov_b32_e32 v57, v31
; GCN-NEXT: v_mov_b32_e32 v58, v31
; GCN-NEXT: v_mov_b32_e32 v59, v31
; GCN-NEXT: v_mov_b32_e32 v60, v31
; GCN-NEXT: v_mov_b32_e32 v61, v31
; GCN-NEXT: v_mov_b32_e32 v62, v31
; GCN-NEXT: v_mov_b32_e32 v63, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB143_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47
; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51
; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52
; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60
; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45
; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40
; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53
; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55
; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41
; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42
; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43
; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44
; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46
; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56
; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57
; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16
; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16
; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16
; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16
; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16
; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16
; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16
; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16
; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16
; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16
; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16
; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16
; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16
; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16
; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16
; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16
; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16
; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16
; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16
; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16
; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16
; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16
; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16
; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
; GCN-NEXT: .LBB143_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(6)
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(5)
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(4)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v64bf16_to_v16i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: v_mov_b32_e32 v6, v3
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: v_mov_b32_e32 v8, v3
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v10, v3
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: v_mov_b32_e32 v12, v3
; VI-NEXT: v_mov_b32_e32 v13, v3
; VI-NEXT: v_mov_b32_e32 v14, v3
; VI-NEXT: v_mov_b32_e32 v15, v3
; VI-NEXT: v_mov_b32_e32 v16, v3
; VI-NEXT: v_mov_b32_e32 v17, v3
; VI-NEXT: v_mov_b32_e32 v18, v3
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: v_mov_b32_e32 v20, v3
; VI-NEXT: v_mov_b32_e32 v21, v3
; VI-NEXT: v_mov_b32_e32 v22, v3
; VI-NEXT: v_mov_b32_e32 v23, v3
; VI-NEXT: v_mov_b32_e32 v24, v3
; VI-NEXT: v_mov_b32_e32 v25, v3
; VI-NEXT: v_mov_b32_e32 v26, v3
; VI-NEXT: v_mov_b32_e32 v27, v3
; VI-NEXT: v_mov_b32_e32 v28, v3
; VI-NEXT: v_mov_b32_e32 v29, v3
; VI-NEXT: v_mov_b32_e32 v30, v3
; VI-NEXT: v_mov_b32_e32 v31, v3
; VI-NEXT: v_mov_b32_e32 v32, v3
; VI-NEXT: v_mov_b32_e32 v33, v3
; VI-NEXT: v_mov_b32_e32 v34, v3
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB143_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; VI-NEXT: .LBB143_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1
; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx4 v[35:36], v[31:34]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v31, vcc, 0x60, v1
; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[31:32], v[27:30]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v27, vcc, 0x50, v1
; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[27:28], v[23:26]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v23, vcc, 64, v1
; VI-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[23:24], v[19:22]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v19, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v11, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[3:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v64bf16_to_v16i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: v_mov_b32_e32 v6, v3
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: v_mov_b32_e32 v8, v3
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_mov_b32_e32 v10, v3
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: v_mov_b32_e32 v12, v3
; GFX9-NEXT: v_mov_b32_e32 v13, v3
; GFX9-NEXT: v_mov_b32_e32 v14, v3
; GFX9-NEXT: v_mov_b32_e32 v15, v3
; GFX9-NEXT: v_mov_b32_e32 v16, v3
; GFX9-NEXT: v_mov_b32_e32 v17, v3
; GFX9-NEXT: v_mov_b32_e32 v18, v3
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: v_mov_b32_e32 v20, v3
; GFX9-NEXT: v_mov_b32_e32 v21, v3
; GFX9-NEXT: v_mov_b32_e32 v22, v3
; GFX9-NEXT: v_mov_b32_e32 v23, v3
; GFX9-NEXT: v_mov_b32_e32 v24, v3
; GFX9-NEXT: v_mov_b32_e32 v25, v3
; GFX9-NEXT: v_mov_b32_e32 v26, v3
; GFX9-NEXT: v_mov_b32_e32 v27, v3
; GFX9-NEXT: v_mov_b32_e32 v28, v3
; GFX9-NEXT: v_mov_b32_e32 v29, v3
; GFX9-NEXT: v_mov_b32_e32 v30, v3
; GFX9-NEXT: v_mov_b32_e32 v31, v3
; GFX9-NEXT: v_mov_b32_e32 v32, v3
; GFX9-NEXT: v_mov_b32_e32 v33, v3
; GFX9-NEXT: v_mov_b32_e32 v34, v3
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB143_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: .LBB143_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v64bf16_to_v16i64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: v_mov_b32_e32 v35, 0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v36, v35
; GFX11-NEXT: v_mov_b32_e32 v37, v35
; GFX11-NEXT: v_mov_b32_e32 v38, v35
; GFX11-NEXT: v_mov_b32_e32 v39, v35
; GFX11-NEXT: v_mov_b32_e32 v40, v35
; GFX11-NEXT: v_mov_b32_e32 v41, v35
; GFX11-NEXT: v_mov_b32_e32 v42, v35
; GFX11-NEXT: v_mov_b32_e32 v43, v35
; GFX11-NEXT: v_mov_b32_e32 v44, v35
; GFX11-NEXT: v_mov_b32_e32 v45, v35
; GFX11-NEXT: v_mov_b32_e32 v46, v35
; GFX11-NEXT: v_mov_b32_e32 v47, v35
; GFX11-NEXT: v_mov_b32_e32 v48, v35
; GFX11-NEXT: v_mov_b32_e32 v49, v35
; GFX11-NEXT: v_mov_b32_e32 v50, v35
; GFX11-NEXT: v_mov_b32_e32 v51, v35
; GFX11-NEXT: v_mov_b32_e32 v52, v35
; GFX11-NEXT: v_mov_b32_e32 v53, v35
; GFX11-NEXT: v_mov_b32_e32 v54, v35
; GFX11-NEXT: v_mov_b32_e32 v55, v35
; GFX11-NEXT: v_mov_b32_e32 v56, v35
; GFX11-NEXT: v_mov_b32_e32 v57, v35
; GFX11-NEXT: v_mov_b32_e32 v58, v35
; GFX11-NEXT: v_mov_b32_e32 v59, v35
; GFX11-NEXT: v_mov_b32_e32 v60, v35
; GFX11-NEXT: v_mov_b32_e32 v61, v35
; GFX11-NEXT: v_mov_b32_e32 v62, v35
; GFX11-NEXT: v_mov_b32_e32 v63, v35
; GFX11-NEXT: v_mov_b32_e32 v64, v35
; GFX11-NEXT: v_mov_b32_e32 v65, v35
; GFX11-NEXT: v_mov_b32_e32 v66, v35
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: s_cbranch_execz .LBB143_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33
; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31
; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29
; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27
; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25
; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23
; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21
; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19
; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17
; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15
; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13
; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11
; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9
; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7
; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5
; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3
; GFX11-NEXT: .LBB143_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112
; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96
; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80
; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64
; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <64 x bfloat> %value to <16 x i64>
br label %end
end:
%phi = phi <16 x i64> [zeroinitializer, %entry], [%cast, %if]
store <16 x i64> %phi, ptr addrspace(1) %out
ret void
}
define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
; GCN-LABEL: v_bitcast_v64bf16_to_v16f64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_mov_b32_e32 v31, 0
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mov_b32_e32 v32, v31
; GCN-NEXT: v_mov_b32_e32 v33, v31
; GCN-NEXT: v_mov_b32_e32 v34, v31
; GCN-NEXT: v_mov_b32_e32 v35, v31
; GCN-NEXT: v_mov_b32_e32 v36, v31
; GCN-NEXT: v_mov_b32_e32 v37, v31
; GCN-NEXT: v_mov_b32_e32 v38, v31
; GCN-NEXT: v_mov_b32_e32 v48, v31
; GCN-NEXT: v_mov_b32_e32 v49, v31
; GCN-NEXT: v_mov_b32_e32 v50, v31
; GCN-NEXT: v_mov_b32_e32 v51, v31
; GCN-NEXT: v_mov_b32_e32 v52, v31
; GCN-NEXT: v_mov_b32_e32 v53, v31
; GCN-NEXT: v_mov_b32_e32 v54, v31
; GCN-NEXT: v_mov_b32_e32 v55, v31
; GCN-NEXT: v_mov_b32_e32 v39, v31
; GCN-NEXT: v_mov_b32_e32 v40, v31
; GCN-NEXT: v_mov_b32_e32 v41, v31
; GCN-NEXT: v_mov_b32_e32 v42, v31
; GCN-NEXT: v_mov_b32_e32 v43, v31
; GCN-NEXT: v_mov_b32_e32 v44, v31
; GCN-NEXT: v_mov_b32_e32 v45, v31
; GCN-NEXT: v_mov_b32_e32 v46, v31
; GCN-NEXT: v_mov_b32_e32 v56, v31
; GCN-NEXT: v_mov_b32_e32 v57, v31
; GCN-NEXT: v_mov_b32_e32 v58, v31
; GCN-NEXT: v_mov_b32_e32 v59, v31
; GCN-NEXT: v_mov_b32_e32 v60, v31
; GCN-NEXT: v_mov_b32_e32 v61, v31
; GCN-NEXT: v_mov_b32_e32 v62, v31
; GCN-NEXT: v_mov_b32_e32 v63, v31
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB144_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16
; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47
; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48
; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50
; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51
; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52
; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60
; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45
; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40
; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53
; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54
; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55
; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41
; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42
; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43
; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44
; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46
; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56
; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57
; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58
; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16
; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16
; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16
; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16
; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16
; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16
; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16
; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16
; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16
; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16
; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16
; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16
; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16
; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16
; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16
; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16
; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16
; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16
; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16
; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16
; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16
; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16
; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16
; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
; GCN-NEXT: .LBB144_2: ; %end
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96
; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80
; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(6)
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(5)
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt expcnt(4)
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bitcast_v64bf16_to_v16f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: v_mov_b32_e32 v5, v3
; VI-NEXT: v_mov_b32_e32 v6, v3
; VI-NEXT: v_mov_b32_e32 v7, v3
; VI-NEXT: v_mov_b32_e32 v8, v3
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v10, v3
; VI-NEXT: v_mov_b32_e32 v11, v3
; VI-NEXT: v_mov_b32_e32 v12, v3
; VI-NEXT: v_mov_b32_e32 v13, v3
; VI-NEXT: v_mov_b32_e32 v14, v3
; VI-NEXT: v_mov_b32_e32 v15, v3
; VI-NEXT: v_mov_b32_e32 v16, v3
; VI-NEXT: v_mov_b32_e32 v17, v3
; VI-NEXT: v_mov_b32_e32 v18, v3
; VI-NEXT: v_mov_b32_e32 v19, v3
; VI-NEXT: v_mov_b32_e32 v20, v3
; VI-NEXT: v_mov_b32_e32 v21, v3
; VI-NEXT: v_mov_b32_e32 v22, v3
; VI-NEXT: v_mov_b32_e32 v23, v3
; VI-NEXT: v_mov_b32_e32 v24, v3
; VI-NEXT: v_mov_b32_e32 v25, v3
; VI-NEXT: v_mov_b32_e32 v26, v3
; VI-NEXT: v_mov_b32_e32 v27, v3
; VI-NEXT: v_mov_b32_e32 v28, v3
; VI-NEXT: v_mov_b32_e32 v29, v3
; VI-NEXT: v_mov_b32_e32 v30, v3
; VI-NEXT: v_mov_b32_e32 v31, v3
; VI-NEXT: v_mov_b32_e32 v32, v3
; VI-NEXT: v_mov_b32_e32 v33, v3
; VI-NEXT: v_mov_b32_e32 v34, v3
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_cbranch_execz .LBB144_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; VI-NEXT: .LBB144_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1
; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx4 v[35:36], v[31:34]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v31, vcc, 0x60, v1
; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[31:32], v[27:30]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v27, vcc, 0x50, v1
; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[27:28], v[23:26]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v23, vcc, 64, v1
; VI-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[23:24], v[19:22]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v19, vcc, 48, v1
; VI-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v1
; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
; VI-NEXT: s_nop 0
; VI-NEXT: v_add_u32_e32 v11, vcc, 16, v1
; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; VI-NEXT: flat_store_dwordx4 v[1:2], v[3:6]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_bitcast_v64bf16_to_v16f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_waitcnt vmcnt(28)
; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: v_mov_b32_e32 v6, v3
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: v_mov_b32_e32 v8, v3
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_mov_b32_e32 v10, v3
; GFX9-NEXT: v_mov_b32_e32 v11, v3
; GFX9-NEXT: v_mov_b32_e32 v12, v3
; GFX9-NEXT: v_mov_b32_e32 v13, v3
; GFX9-NEXT: v_mov_b32_e32 v14, v3
; GFX9-NEXT: v_mov_b32_e32 v15, v3
; GFX9-NEXT: v_mov_b32_e32 v16, v3
; GFX9-NEXT: v_mov_b32_e32 v17, v3
; GFX9-NEXT: v_mov_b32_e32 v18, v3
; GFX9-NEXT: v_mov_b32_e32 v19, v3
; GFX9-NEXT: v_mov_b32_e32 v20, v3
; GFX9-NEXT: v_mov_b32_e32 v21, v3
; GFX9-NEXT: v_mov_b32_e32 v22, v3
; GFX9-NEXT: v_mov_b32_e32 v23, v3
; GFX9-NEXT: v_mov_b32_e32 v24, v3
; GFX9-NEXT: v_mov_b32_e32 v25, v3
; GFX9-NEXT: v_mov_b32_e32 v26, v3
; GFX9-NEXT: v_mov_b32_e32 v27, v3
; GFX9-NEXT: v_mov_b32_e32 v28, v3
; GFX9-NEXT: v_mov_b32_e32 v29, v3
; GFX9-NEXT: v_mov_b32_e32 v30, v3
; GFX9-NEXT: v_mov_b32_e32 v31, v3
; GFX9-NEXT: v_mov_b32_e32 v32, v3
; GFX9-NEXT: v_mov_b32_e32 v33, v3
; GFX9-NEXT: v_mov_b32_e32 v34, v3
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB144_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-NEXT: .LBB144_2: ; %end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bitcast_v64bf16_to_v16f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: v_mov_b32_e32 v35, 0
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v36, v35
; GFX11-NEXT: v_mov_b32_e32 v37, v35
; GFX11-NEXT: v_mov_b32_e32 v38, v35
; GFX11-NEXT: v_mov_b32_e32 v39, v35
; GFX11-NEXT: v_mov_b32_e32 v40, v35
; GFX11-NEXT: v_mov_b32_e32 v41, v35
; GFX11-NEXT: v_mov_b32_e32 v42, v35
; GFX11-NEXT: v_mov_b32_e32 v43, v35
; GFX11-NEXT: v_mov_b32_e32 v44, v35
; GFX11-NEXT: v_mov_b32_e32 v45, v35
; GFX11-NEXT: v_mov_b32_e32 v46, v35
; GFX11-NEXT: v_mov_b32_e32 v47, v35
; GFX11-NEXT: v_mov_b32_e32 v48, v35
; GFX11-NEXT: v_mov_b32_e32 v49, v35
; GFX11-NEXT: v_mov_b32_e32 v50, v35
; GFX11-NEXT: v_mov_b32_e32 v51, v35
; GFX11-NEXT: v_mov_b32_e32 v52, v35
; GFX11-NEXT: v_mov_b32_e32 v53, v35
; GFX11-NEXT: v_mov_b32_e32 v54, v35
; GFX11-NEXT: v_mov_b32_e32 v55, v35
; GFX11-NEXT: v_mov_b32_e32 v56, v35
; GFX11-NEXT: v_mov_b32_e32 v57, v35
; GFX11-NEXT: v_mov_b32_e32 v58, v35
; GFX11-NEXT: v_mov_b32_e32 v59, v35
; GFX11-NEXT: v_mov_b32_e32 v60, v35
; GFX11-NEXT: v_mov_b32_e32 v61, v35
; GFX11-NEXT: v_mov_b32_e32 v62, v35
; GFX11-NEXT: v_mov_b32_e32 v63, v35
; GFX11-NEXT: v_mov_b32_e32 v64, v35
; GFX11-NEXT: v_mov_b32_e32 v65, v35
; GFX11-NEXT: v_mov_b32_e32 v66, v35
; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-NEXT: s_cbranch_execz .LBB144_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33
; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31
; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29
; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27
; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25
; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23
; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21
; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19
; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17
; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15
; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13
; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11
; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9
; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7
; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5
; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3
; GFX11-NEXT: .LBB144_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112
; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96
; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80
; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64
; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48
; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32
; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16
; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
if:
%cast = bitcast <64 x bfloat> %value to <16 x double>
br label %end
end:
%phi = phi <16 x double> [zeroinitializer, %entry], [%cast, %if]
store <16 x double> %phi, ptr addrspace(1) %out
ret void
}