blob: 084b7a2d59b2f458e07ee90d3cb9f94e658f897f [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GFX9
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GFX12
; Dynamic extract of one 128-bit element from a frozen-poison <128 x i8>
; vector that is reinterpreted as <8 x i128>. The extracted element is bitcast
; to <16 x i8> and stored to %out.
;
; In the expected output for all three targets, a 32-bit value is assembled
; from s0 with shift/and/or, copied into s1..s31 and then into v0..v31, and the
; four result dwords are read back with indexed moves (s_set_gpr_idx_on for
; gfx950, v_movrels through m0 for gfx1100 and gfx1200). The loaded index is
; added to itself and then shifted left by one before use, with +2 and +3
; applied for the upper two dwords.
define amdgpu_kernel void @test_bitcast_llc_v128i8_v16i8(ptr addrspace(1) %out, i32 %idx) {
; GFX9-LABEL: test_bitcast_llc_v128i8_v16i8:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s33, s[4:5], 0x8
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: s_and_b32 s1, s0, 0xff
; GFX9-NEXT: s_or_b32 s0, s1, s0
; GFX9-NEXT: s_and_b32 s1, s0, 0xffff
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: s_or_b32 s0, s1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s33, s33, s33
; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: s_lshl_b32 s33, s33, 1
; GFX9-NEXT: s_mov_b32 s2, s0
; GFX9-NEXT: s_mov_b32 s3, s0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s0
; GFX9-NEXT: s_mov_b32 s6, s0
; GFX9-NEXT: s_mov_b32 s7, s0
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s0
; GFX9-NEXT: s_mov_b32 s10, s0
; GFX9-NEXT: s_mov_b32 s11, s0
; GFX9-NEXT: s_mov_b32 s12, s0
; GFX9-NEXT: s_mov_b32 s13, s0
; GFX9-NEXT: s_mov_b32 s14, s0
; GFX9-NEXT: s_mov_b32 s15, s0
; GFX9-NEXT: s_mov_b32 s16, s0
; GFX9-NEXT: s_mov_b32 s17, s0
; GFX9-NEXT: s_mov_b32 s18, s0
; GFX9-NEXT: s_mov_b32 s19, s0
; GFX9-NEXT: s_mov_b32 s20, s0
; GFX9-NEXT: s_mov_b32 s21, s0
; GFX9-NEXT: s_mov_b32 s22, s0
; GFX9-NEXT: s_mov_b32 s23, s0
; GFX9-NEXT: s_mov_b32 s24, s0
; GFX9-NEXT: s_mov_b32 s25, s0
; GFX9-NEXT: s_mov_b32 s26, s0
; GFX9-NEXT: s_mov_b32 s27, s0
; GFX9-NEXT: s_mov_b32 s28, s0
; GFX9-NEXT: s_mov_b32 s29, s0
; GFX9-NEXT: s_mov_b32 s30, s0
; GFX9-NEXT: s_mov_b32 s31, s0
; GFX9-NEXT: s_add_i32 s36, s33, 3
; GFX9-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX9-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX9-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX9-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
; GFX9-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
; GFX9-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GFX9-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX9-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX9-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
; GFX9-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GFX9-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
; GFX9-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
; GFX9-NEXT: v_mov_b64_e32 v[24:25], s[24:25]
; GFX9-NEXT: v_mov_b64_e32 v[26:27], s[26:27]
; GFX9-NEXT: v_mov_b64_e32 v[28:29], s[28:29]
; GFX9-NEXT: v_mov_b64_e32 v[30:31], s[30:31]
; GFX9-NEXT: s_set_gpr_idx_on s36, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v35, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s0, s33, 2
; GFX9-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v34, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: v_mov_b32_e32 v36, 0
; GFX9-NEXT: s_set_gpr_idx_on s33, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v33, v1
; GFX9-NEXT: v_mov_b32_e32 v32, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: global_store_dwordx4 v36, v[32:35], s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_bitcast_llc_v128i8_v16i8:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x0
; GFX11-NEXT: s_load_b32 s33, s[4:5], 0x8
; GFX11-NEXT: s_lshl_b32 s0, s0, 8
; GFX11-NEXT: v_mov_b32_e32 v35, 0
; GFX11-NEXT: s_and_b32 s1, s0, 0xff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_and_b32 s1, s0, 0xffff
; GFX11-NEXT: s_lshl_b32 s0, s0, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s0
; GFX11-NEXT: s_mov_b32 s6, s0
; GFX11-NEXT: s_mov_b32 s7, s0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s0
; GFX11-NEXT: s_mov_b32 s10, s0
; GFX11-NEXT: s_mov_b32 s11, s0
; GFX11-NEXT: s_mov_b32 s12, s0
; GFX11-NEXT: s_mov_b32 s13, s0
; GFX11-NEXT: s_mov_b32 s14, s0
; GFX11-NEXT: s_mov_b32 s15, s0
; GFX11-NEXT: s_mov_b32 s16, s0
; GFX11-NEXT: s_mov_b32 s17, s0
; GFX11-NEXT: s_mov_b32 s18, s0
; GFX11-NEXT: s_mov_b32 s19, s0
; GFX11-NEXT: s_mov_b32 s20, s0
; GFX11-NEXT: s_mov_b32 s21, s0
; GFX11-NEXT: s_mov_b32 s22, s0
; GFX11-NEXT: s_mov_b32 s23, s0
; GFX11-NEXT: s_mov_b32 s24, s0
; GFX11-NEXT: s_mov_b32 s25, s0
; GFX11-NEXT: s_mov_b32 s26, s0
; GFX11-NEXT: s_mov_b32 s27, s0
; GFX11-NEXT: s_mov_b32 s28, s0
; GFX11-NEXT: s_mov_b32 s29, s0
; GFX11-NEXT: s_mov_b32 s30, s0
; GFX11-NEXT: s_mov_b32 s31, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s33, s33, s33
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21
; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23
; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25
; GFX11-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27
; GFX11-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29
; GFX11-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31
; GFX11-NEXT: s_lshl_b32 s0, s33, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 m0, s0, 3
; GFX11-NEXT: v_movrels_b32_e32 v34, v0
; GFX11-NEXT: s_add_i32 m0, s0, 2
; GFX11-NEXT: v_movrels_b32_e32 v33, v0
; GFX11-NEXT: s_mov_b32 m0, s0
; GFX11-NEXT: v_movrels_b32_e32 v32, v1
; GFX11-NEXT: v_movrels_b32_e32 v31, v0
; GFX11-NEXT: global_store_b128 v35, v[31:34], s[34:35]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_bitcast_llc_v128i8_v16i8:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b96 s[36:38], s[4:5], 0x0
; GFX12-NEXT: s_lshl_b32 s0, s0, 8
; GFX12-NEXT: v_mov_b32_e32 v35, 0
; GFX12-NEXT: s_and_b32 s1, s0, 0xff
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b32 s0, s1, s0
; GFX12-NEXT: s_and_b32 s1, s0, 0xffff
; GFX12-NEXT: s_lshl_b32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b32 s0, s1, s0
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
; GFX12-NEXT: s_mov_b32 s4, s0
; GFX12-NEXT: s_mov_b32 s5, s0
; GFX12-NEXT: s_mov_b32 s6, s0
; GFX12-NEXT: s_mov_b32 s7, s0
; GFX12-NEXT: s_mov_b32 s8, s0
; GFX12-NEXT: s_mov_b32 s9, s0
; GFX12-NEXT: s_mov_b32 s10, s0
; GFX12-NEXT: s_mov_b32 s11, s0
; GFX12-NEXT: s_mov_b32 s12, s0
; GFX12-NEXT: s_mov_b32 s13, s0
; GFX12-NEXT: s_mov_b32 s14, s0
; GFX12-NEXT: s_mov_b32 s15, s0
; GFX12-NEXT: s_mov_b32 s16, s0
; GFX12-NEXT: s_mov_b32 s17, s0
; GFX12-NEXT: s_mov_b32 s18, s0
; GFX12-NEXT: s_mov_b32 s19, s0
; GFX12-NEXT: s_mov_b32 s20, s0
; GFX12-NEXT: s_mov_b32 s21, s0
; GFX12-NEXT: s_mov_b32 s22, s0
; GFX12-NEXT: s_mov_b32 s23, s0
; GFX12-NEXT: s_mov_b32 s24, s0
; GFX12-NEXT: s_mov_b32 s25, s0
; GFX12-NEXT: s_mov_b32 s26, s0
; GFX12-NEXT: s_mov_b32 s27, s0
; GFX12-NEXT: s_mov_b32 s28, s0
; GFX12-NEXT: s_mov_b32 s29, s0
; GFX12-NEXT: s_mov_b32 s30, s0
; GFX12-NEXT: s_mov_b32 s31, s0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s33, s38, s38
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
; GFX12-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
; GFX12-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
; GFX12-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21
; GFX12-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23
; GFX12-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25
; GFX12-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27
; GFX12-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29
; GFX12-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31
; GFX12-NEXT: s_lshl_b32 s0, s33, 1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 m0, s0, 3
; GFX12-NEXT: v_movrels_b32_e32 v34, v0
; GFX12-NEXT: s_add_co_i32 m0, s0, 2
; GFX12-NEXT: v_movrels_b32_e32 v33, v0
; GFX12-NEXT: s_mov_b32 m0, s0
; GFX12-NEXT: v_movrels_b32_e32 v32, v1
; GFX12-NEXT: v_movrels_b32_e32 v31, v0
; GFX12-NEXT: global_store_b128 v35, v[31:34], s[36:37]
; GFX12-NEXT: s_endpgm
entry:
; Source vector: poison pinned to a fixed (arbitrary) value by freeze.
%alloca = freeze <128 x i8> poison
; Reinterpret the 1024-bit vector as eight 128-bit lanes.
%allocabc = bitcast <128 x i8> %alloca to <8 x i128>
; %idx is a kernel argument, so the lane index is not a compile-time constant.
%vec = extractelement <8 x i128> %allocabc, i32 %idx
; View the selected lane as bytes again and write it out.
%vecbc = bitcast i128 %vec to <16 x i8>
store <16 x i8> %vecbc, ptr addrspace(1) %out, align 16
ret void
}
; Dynamic extract of one 128-bit element from a frozen-poison <64 x i16>
; vector that is reinterpreted as <8 x i128>. The extracted element is bitcast
; to <8 x i16> and stored to %out.
;
; The expected output for each target contains no register initialization
; before the indexed reads: the loaded index is added to itself, shifted left
; by one, and used directly (plus +2 / +3) to read from v0/v1 with indexed
; moves.
define amdgpu_kernel void @test_bitcast_llc_v64i16_v8i16(ptr addrspace(1) %out, i32 %idx) {
; GFX9-LABEL: test_bitcast_llc_v64i16_v8i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s2, s2
; GFX9-NEXT: s_lshl_b32 s2, s2, 1
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v1, v1
; GFX9-NEXT: s_add_i32 s3, s2, 3
; GFX9-NEXT: v_mov_b32_e32 v0, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v3, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s2, s2, 2
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_bitcast_llc_v64i16_v8i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s2, s2, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s2, s2, 1
; GFX11-NEXT: s_mov_b32 m0, s2
; GFX11-NEXT: v_movrels_b32_e32 v1, v1
; GFX11-NEXT: v_movrels_b32_e32 v0, v0
; GFX11-NEXT: s_add_i32 m0, s2, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_movrels_b32_e32 v3, v0
; GFX11-NEXT: s_add_i32 m0, s2, 2
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_movrels_b32_e32 v2, v0
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_bitcast_llc_v64i16_v8i16:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s2, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b32 s2, s2, 1
; GFX12-NEXT: s_mov_b32 m0, s2
; GFX12-NEXT: v_movrels_b32_e32 v1, v1
; GFX12-NEXT: v_movrels_b32_e32 v0, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_movrels_b32_e32 v3, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 2
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: v_movrels_b32_e32 v2, v0
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Source vector: poison pinned to a fixed (arbitrary) value by freeze.
%alloca = freeze <64 x i16> poison
; Reinterpret the 1024-bit vector as eight 128-bit lanes.
%allocabc = bitcast <64 x i16> %alloca to <8 x i128>
; %idx is a kernel argument, so the lane index is not a compile-time constant.
%vec = extractelement <8 x i128> %allocabc, i32 %idx
; View the selected lane as eight 16-bit elements and write it out.
%vecbc = bitcast i128 %vec to <8 x i16>
store <8 x i16> %vecbc, ptr addrspace(1) %out, align 16
ret void
}
; Dynamic extract of one 128-bit element from a frozen-poison <32 x i32>
; vector that is reinterpreted as <8 x i128>. The extracted element is bitcast
; to <4 x i32> and stored to %out.
;
; The expected output for each target contains no register initialization
; before the indexed reads: the loaded index is added to itself, shifted left
; by one, and used directly (plus +2 / +3) to read from v0/v1 with indexed
; moves.
define amdgpu_kernel void @test_bitcast_llc_v32i32_v4i32(ptr addrspace(1) %out, i32 %idx) {
; GFX9-LABEL: test_bitcast_llc_v32i32_v4i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s2, s2
; GFX9-NEXT: s_lshl_b32 s2, s2, 1
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v1, v1
; GFX9-NEXT: s_add_i32 s3, s2, 3
; GFX9-NEXT: v_mov_b32_e32 v0, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v3, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s2, s2, 2
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_bitcast_llc_v32i32_v4i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s2, s2, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s2, s2, 1
; GFX11-NEXT: s_mov_b32 m0, s2
; GFX11-NEXT: v_movrels_b32_e32 v1, v1
; GFX11-NEXT: v_movrels_b32_e32 v0, v0
; GFX11-NEXT: s_add_i32 m0, s2, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_movrels_b32_e32 v3, v0
; GFX11-NEXT: s_add_i32 m0, s2, 2
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_movrels_b32_e32 v2, v0
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_bitcast_llc_v32i32_v4i32:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s2, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b32 s2, s2, 1
; GFX12-NEXT: s_mov_b32 m0, s2
; GFX12-NEXT: v_movrels_b32_e32 v1, v1
; GFX12-NEXT: v_movrels_b32_e32 v0, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_movrels_b32_e32 v3, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 2
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: v_movrels_b32_e32 v2, v0
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
; Source vector: poison pinned to a fixed (arbitrary) value by freeze.
%alloca = freeze <32 x i32> poison
; Reinterpret the 1024-bit vector as eight 128-bit lanes.
%allocabc = bitcast <32 x i32> %alloca to <8 x i128>
; %idx is a kernel argument, so the lane index is not a compile-time constant.
%vec = extractelement <8 x i128> %allocabc, i32 %idx
; View the selected lane as four 32-bit elements and write it out.
%vecbc = bitcast i128 %vec to <4 x i32>
store <4 x i32> %vecbc, ptr addrspace(1) %out, align 16
ret void
}
; Dynamic extract of one 256-bit element from a frozen-poison <16 x i64>
; vector that is reinterpreted as <4 x i256>. The extracted element is bitcast
; to <4 x i64> and stored to %out.
; NOTE(review): the function name ends in "v4i256" (the intermediate vector
; type), whereas the value actually stored is <4 x i64>.
;
; The expected output writes the 32-byte result as two 16-byte stores, at
; offset 0 and offset 16. Two base indices are derived from the doubled kernel
; index (one from the doubled value itself, one from the doubled value plus
; one); each is doubled again and shifted left by one, then used with +2 / +3
; for the indexed reads.
define amdgpu_kernel void @test_bitcast_llc_v16i64_v4i256(ptr addrspace(1) %out, i32 %idx) {
; GFX9-LABEL: test_bitcast_llc_v16i64_v4i256:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s2, s2
; GFX9-NEXT: s_add_i32 s3, s2, 1
; GFX9-NEXT: s_add_i32 s3, s3, s3
; GFX9-NEXT: s_lshl_b32 s3, s3, 1
; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v1, v1
; GFX9-NEXT: s_add_i32 s4, s3, 3
; GFX9-NEXT: v_mov_b32_e32 v0, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s5, s3, 2
; GFX9-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v3, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s2, s2, s2
; GFX9-NEXT: s_set_gpr_idx_on s5, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_lshl_b32 s2, s2, 1
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v5, v1
; GFX9-NEXT: v_mov_b32_e32 v4, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s3, s2, 3
; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v7, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s2, s2, 2
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v6, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_bitcast_llc_v16i64_v4i256:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s2, s0, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s0, s2, 1
; GFX11-NEXT: s_add_i32 s2, s2, s2
; GFX11-NEXT: s_add_i32 s0, s0, s0
; GFX11-NEXT: s_lshl_b32 s2, s2, 1
; GFX11-NEXT: s_lshl_b32 s3, s0, 1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: s_mov_b32 m0, s3
; GFX11-NEXT: v_movrels_b32_e32 v1, v1
; GFX11-NEXT: v_movrels_b32_e32 v0, v0
; GFX11-NEXT: s_add_i32 m0, s3, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_movrels_b32_e32 v3, v0
; GFX11-NEXT: s_add_i32 m0, s3, 2
; GFX11-NEXT: v_movrels_b32_e32 v2, v0
; GFX11-NEXT: s_mov_b32 m0, s2
; GFX11-NEXT: v_movrels_b32_e32 v5, v1
; GFX11-NEXT: v_movrels_b32_e32 v4, v0
; GFX11-NEXT: s_add_i32 m0, s2, 3
; GFX11-NEXT: v_movrels_b32_e32 v7, v0
; GFX11-NEXT: s_add_i32 m0, s2, 2
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: v_movrels_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_bitcast_llc_v16i64_v4i256:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s2, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_co_i32 s3, s2, 1
; GFX12-NEXT: s_add_co_i32 s2, s2, s2
; GFX12-NEXT: s_add_co_i32 s3, s3, s3
; GFX12-NEXT: s_lshl_b32 s2, s2, 1
; GFX12-NEXT: s_lshl_b32 s3, s3, 1
; GFX12-NEXT: s_mov_b32 m0, s3
; GFX12-NEXT: v_movrels_b32_e32 v1, v1
; GFX12-NEXT: v_movrels_b32_e32 v0, v0
; GFX12-NEXT: s_add_co_i32 m0, s3, 3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_movrels_b32_e32 v3, v0
; GFX12-NEXT: s_add_co_i32 m0, s3, 2
; GFX12-NEXT: v_movrels_b32_e32 v2, v0
; GFX12-NEXT: s_mov_b32 m0, s2
; GFX12-NEXT: v_movrels_b32_e32 v5, v1
; GFX12-NEXT: v_movrels_b32_e32 v4, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 3
; GFX12-NEXT: v_movrels_b32_e32 v7, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 2
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: v_movrels_b32_e32 v6, v0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX12-NEXT: s_endpgm
entry:
; Source vector: poison pinned to a fixed (arbitrary) value by freeze.
%alloca = freeze <16 x i64> poison
; Reinterpret the 1024-bit vector as four 256-bit lanes.
%allocabc = bitcast <16 x i64> %alloca to <4 x i256>
; %idx is a kernel argument, so the lane index is not a compile-time constant.
%vec = extractelement <4 x i256> %allocabc, i32 %idx
; View the selected lane as four 64-bit elements and write it out. The store
; is 32 bytes wide but only declares 16-byte alignment.
%vecbc = bitcast i256 %vec to <4 x i64>
store <4 x i64> %vecbc, ptr addrspace(1) %out, align 16
ret void
}