; blob: 75c5d206e793398e876067f072c3a9df70eca371 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
; Volatile load of a <33 x float> through a global (addrspace 1) pointer,
; returning elements 30, 31 and 32 as a <3 x float>.
;
; The assertions below expect every piece of the 33-element vector to be
; loaded: eight dwordx4 loads plus one dword load, each followed by its own
; s_waitcnt vmcnt(0). The loads carry glc on gfx900 and sc0 sc1 on gfx942.
; The dwordx4 loads at offsets 0 through 96 all write v[2:5], so their results
; are overwritten and never read. Elements 30 and 31 (byte offsets 120 and
; 124) are the last two registers of the dwordx4 load at offset 112 and are
; copied into v0 and v1; element 32 (byte offset 128) is loaded straight
; into v2.
define <3 x float> @extract_subvector_v3f32_v33f32_elt30_0(ptr addrspace(1) %ptr) #0 {
; GFX900-LABEL: extract_subvector_v3f32_v33f32_elt30_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:96 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:80 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:64 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:48 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:32 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_dword v2, v[0:1], off offset:128 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[3:6], v[0:1], off offset:112 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v5
; GFX900-NEXT: v_mov_b32_e32 v1, v6
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: extract_subvector_v3f32_v33f32_elt30_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:96 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:80 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:64 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:48 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:32 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_load_dword v2, v[0:1], off offset:128 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:112 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
; GFX942-NEXT: s_setpc_b64 s[30:31]
%val = load volatile <33 x float>, ptr addrspace(1) %ptr, align 4
; Select the trailing three lanes (30, 31, 32) of the 33-lane vector.
%extract.subvector = shufflevector <33 x float> %val, <33 x float> poison, <3 x i32> <i32 30, i32 31, i32 32>
ret <3 x float> %extract.subvector
}
; Non-volatile load of a <33 x float> through a global (addrspace 1) pointer.
; Elements 0-3 are written out with a raw buffer store (null buffer resource,
; all three i32 operands zero) and elements 30-32 are returned as a
; <3 x float>.
;
; The assertions below expect only the pieces that are actually used to be
; loaded: a dwordx4 at offset 0 feeding the buffer store, a dwordx4 at
; offset 112 whose last two registers (elements 30 and 31) are copied into
; v0 and v1, and a single dword at offset 128 (element 32) loaded directly
; into v2.
define <3 x float> @extract_subvector_v3f32_v33f32_elt30_1(ptr addrspace(1) %ptr) #0 {
; GFX900-LABEL: extract_subvector_v3f32_v33f32_elt30_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
; GFX900-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:112
; GFX900-NEXT: global_load_dword v2, v[0:1], off offset:128
; GFX900-NEXT: s_mov_b32 s4, 0
; GFX900-NEXT: s_mov_b32 s5, s4
; GFX900-NEXT: s_mov_b32 s6, s4
; GFX900-NEXT: s_mov_b32 s7, s4
; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_mov_b32_e32 v0, v9
; GFX900-NEXT: v_mov_b32_e32 v1, v10
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: extract_subvector_v3f32_v33f32_elt30_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX942-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:112
; GFX942-NEXT: global_load_dword v2, v[0:1], off offset:128
; GFX942-NEXT: s_mov_b32 s0, 0
; GFX942-NEXT: s_mov_b32 s1, s0
; GFX942-NEXT: s_mov_b32 s2, s0
; GFX942-NEXT: s_mov_b32 s3, s0
; GFX942-NEXT: s_waitcnt vmcnt(2)
; GFX942-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GFX942-NEXT: s_waitcnt vmcnt(2)
; GFX942-NEXT: v_mov_b32_e32 v0, v10
; GFX942-NEXT: v_mov_b32_e32 v1, v11
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%val = load <33 x float>, ptr addrspace(1) %ptr, align 4
; Leading slice (lanes 0-3); consumed only by the buffer store below.
%val.slice.0 = shufflevector <33 x float> %val, <33 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val.slice.0, ptr addrspace(8) null, i32 0, i32 0, i32 0)
; Trailing slice (lanes 30-32); this is the returned value.
%val.slice.48 = shufflevector <33 x float> %val, <33 x float> poison, <3 x i32> <i32 30, i32 31, i32 32>
ret <3 x float> %val.slice.48
}
; Non-volatile load of a <36 x float> through a global (addrspace 1) pointer.
; Elements 0-3 are written out with a raw buffer store (null buffer resource,
; all three i32 operands zero) and elements 30-35 are returned as a
; <6 x float>.
;
; The assertions below expect three dwordx4 loads: offset 0 feeding the
; buffer store, offset 112 whose last two registers (elements 30 and 31) are
; copied into v0 and v1, and offset 128 (elements 32-35) loaded directly
; into v[2:5]. Both targets are expected to use the same vector registers;
; only the scalar registers holding the zeroed buffer resource differ.
define <6 x float> @extract_subvector_v6f32_v36f32_elt30(ptr addrspace(1) %ptr) #0 {
; GFX900-LABEL: extract_subvector_v6f32_v36f32_elt30:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX900-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:112
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:128
; GFX900-NEXT: s_mov_b32 s4, 0
; GFX900-NEXT: s_mov_b32 s5, s4
; GFX900-NEXT: s_mov_b32 s6, s4
; GFX900-NEXT: s_mov_b32 s7, s4
; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0
; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_mov_b32_e32 v0, v12
; GFX900-NEXT: v_mov_b32_e32 v1, v13
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: extract_subvector_v6f32_v36f32_elt30:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX942-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:112
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:128
; GFX942-NEXT: s_mov_b32 s0, 0
; GFX942-NEXT: s_mov_b32 s1, s0
; GFX942-NEXT: s_mov_b32 s2, s0
; GFX942-NEXT: s_mov_b32 s3, s0
; GFX942-NEXT: s_waitcnt vmcnt(2)
; GFX942-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
; GFX942-NEXT: s_waitcnt vmcnt(2)
; GFX942-NEXT: v_mov_b32_e32 v0, v12
; GFX942-NEXT: v_mov_b32_e32 v1, v13
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%val = load <36 x float>, ptr addrspace(1) %ptr, align 4
; Leading slice (lanes 0-3); consumed only by the buffer store below.
%val.slice.0 = shufflevector <36 x float> %val, <36 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val.slice.0, ptr addrspace(8) null, i32 0, i32 0, i32 0)
; Trailing slice (lanes 30-35); this is the returned value.
%val.slice.1 = shufflevector <36 x float> %val, <36 x float> poison, <6 x i32> <i32 30, i32 31, i32 32, i32 33, i32 34, i32 35>
ret <6 x float> %val.slice.1
}
; Non-volatile load of a <51 x float> through a global (addrspace 1) pointer.
; Elements 0-3 are written out with a raw buffer store (null buffer resource,
; all three i32 operands zero) and elements 48-50 are returned as a
; <3 x float>.
; NOTE(review): the name suggests this is the reproducer for an assertion
; failure tracked as issue 153808 - confirm against the issue tracker.
;
; The assertions below expect just two loads: a dwordx4 at offset 0 feeding
; the buffer store and a dwordx3 at offset 192 (48 floats * 4 bytes) holding
; the returned lanes. On gfx900 the pointer is first copied from v[0:1] to
; v[3:4] so the dwordx3 can be loaded directly into v[0:2]; on gfx942 the
; dwordx3 lands in v[2:4] and is then moved down into v0-v2.
define <3 x float> @issue153808_vector_extract_assert(ptr addrspace(1) %ptr) #0 {
; GFX900-LABEL: issue153808_vector_extract_assert:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v4, v1
; GFX900-NEXT: v_mov_b32_e32 v3, v0
; GFX900-NEXT: global_load_dwordx4 v[5:8], v[3:4], off
; GFX900-NEXT: global_load_dwordx3 v[0:2], v[3:4], off offset:192
; GFX900-NEXT: s_mov_b32 s4, 0
; GFX900-NEXT: s_mov_b32 s5, s4
; GFX900-NEXT: s_mov_b32 s6, s4
; GFX900-NEXT: s_mov_b32 s7, s4
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: buffer_store_dwordx4 v[5:8], off, s[4:7], 0
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: issue153808_vector_extract_assert:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX942-NEXT: global_load_dwordx3 v[2:4], v[0:1], off offset:192
; GFX942-NEXT: s_mov_b32 s0, 0
; GFX942-NEXT: s_mov_b32 s1, s0
; GFX942-NEXT: s_mov_b32 s2, s0
; GFX942-NEXT: s_mov_b32 s3, s0
; GFX942-NEXT: s_waitcnt vmcnt(1)
; GFX942-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
; GFX942-NEXT: s_waitcnt vmcnt(1)
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%val = load <51 x float>, ptr addrspace(1) %ptr, align 4
; Leading slice (lanes 0-3); consumed only by the buffer store below.
%val.slice.0 = shufflevector <51 x float> %val, <51 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val.slice.0, ptr addrspace(8) null, i32 0, i32 0, i32 0)
; Trailing slice (lanes 48-50); this is the returned value.
%val.slice.48 = shufflevector <51 x float> %val, <51 x float> poison, <3 x i32> <i32 48, i32 49, i32 50>
ret <3 x float> %val.slice.48
}
; Raw buffer store intrinsic used by the tests above to give the leading
; <4 x float> slice a use. Operands: the data vector, a buffer resource
; pointer (addrspace 8), and three i32 values, the last of which must be an
; immediate. NOTE(review): the i32 operands are presumably offset, scalar
; offset and auxiliary/cache-policy bits - confirm against the AMDGPU docs.
declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8) writeonly captures(none), i32, i32, i32 immarg) #1
; #0 is attached to every test function defined in this file.
attributes #0 = { nounwind }
; #1 is attached to the buffer store intrinsic declaration.
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX9: {{.*}}