; blob: 970c98afd2a050f25454e047e7816f017f4e73f3 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s
; Baseline case: %gep0 (2048 elements of `ptr addrspace(3)` past %inptr) is
; computed once in the entry block and is used only as the address operand of
; the two @llvm.amdgcn.ds.read.tr16.b64 calls, one in %then and one in %else.
; The autogenerated assertions below expect each ds_read_b64_tr_b16 to carry
; the displacement as an immediate (`offset:8192`) on top of the unmodified
; %inptr value, with no separate add instruction anywhere in the kernel.
; NOTE(review): presumably this exercises sinking/folding of the address into
; the memory intrinsic in each successor block - confirm against the change
; that introduced this test.
define amdgpu_kernel void @memoryIntrinstic(ptr addrspace(3) %inptr, i1 %cond, ptr addrspace(3) %outptr) {
; CHECK-LABEL: memoryIntrinstic:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_bitcmp0_b32 s1, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %else
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0 offset:8192
; CHECK-NEXT: s_mov_b32 s1, 0x7060302
; CHECK-NEXT: s_mov_b32 s3, 0x5040100
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_perm_b32 v0, v3, v2, s1
; CHECK-NEXT: v_perm_b32 v1, v3, v2, s3
; CHECK-NEXT: s_cbranch_execz .LBB0_3
; CHECK-NEXT: s_branch .LBB0_4
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: ; implicit-def: $vgpr1
; CHECK-NEXT: .LBB0_3: ; %then
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0 offset:8192
; CHECK-NEXT: s_mov_b32 s0, 0x5040100
; CHECK-NEXT: s_mov_b32 s1, 0x7060302
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_perm_b32 v0, v3, v2, s0
; CHECK-NEXT: v_perm_b32 v1, v3, v2, s1
; CHECK-NEXT: .LBB0_4: ; %end
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: ds_write_b64 v2, v[0:1]
; CHECK-NEXT: s_endpgm
; Entry block: the address is formed here, away from both of its users.
%gep0 = getelementptr ptr addrspace(3), ptr addrspace(3) %inptr, i32 2048
br i1 %cond, label %then, label %else
then:
; Transposing LDS read from %gep0, followed by a <0,2,1,3> lane permutation.
%load0 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
%shuf0 = shufflevector <4 x half> %load0, <4 x half> %load0, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
br label %end
else:
; Same read from the same address, but with a different <1,3,0,2> permutation
; so the two arms are not identical.
%load1 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
%shuf1 = shufflevector <4 x half> %load1, <4 x half> %load1, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
br label %end
end:
; Merge the per-arm results and write them to %outptr.
%res = phi <4 x half> [ %shuf0, %then ], [ %shuf1, %else ]
store <4 x half> %res, ptr addrspace(3) %outptr
ret void
}
; Variant of the two-armed transposing-read kernel in which %gep0 has one more
; user: in %else it is also passed as the first operand of
; @llvm.amdgcn.raw.buffer.store, i.e. the pointer itself is the value written
; to the buffer described by %rsrc (the expected buffer_store_dword stores the
; register that holds the computed address).
; The autogenerated assertions below expect the address to be materialized
; once in the entry block (`s_add_i32 ..., 0x2000`) and both
; ds_read_b64_tr_b16 instructions to use that register with no immediate
; offset.
; NOTE(review): presumably named "bad" because this non-address use of %gep0
; keeps the offset from being folded into the reads - confirm against the
; change that introduced this test.
define amdgpu_kernel void @badIntrinsicUse(ptr addrspace(3) %inptr, i1 %cond, ptr addrspace(3) %outptr, <4 x i32> %rsrc) {
; CHECK-LABEL: badIntrinsicUse:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_add_i32 s3, s0, 0x2000
; CHECK-NEXT: s_cmp_eq_u32 s1, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB1_2
; CHECK-NEXT: ; %bb.1: ; %else
; CHECK-NEXT: v_mov_b32_e32 v0, s3
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0
; CHECK-NEXT: s_mov_b32 s0, 0x7060302
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CHECK-NEXT: v_perm_b32 v0, v3, v2, s0
; CHECK-NEXT: s_mov_b32 s0, 0x5040100
; CHECK-NEXT: v_perm_b32 v1, v3, v2, s0
; CHECK-NEXT: s_cbranch_execz .LBB1_3
; CHECK-NEXT: s_branch .LBB1_4
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: ; implicit-def: $vgpr1
; CHECK-NEXT: .LBB1_3: ; %then
; CHECK-NEXT: v_mov_b32_e32 v0, s3
; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0
; CHECK-NEXT: s_mov_b32 s0, 0x5040100
; CHECK-NEXT: s_mov_b32 s1, 0x7060302
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_perm_b32 v0, v3, v2, s0
; CHECK-NEXT: v_perm_b32 v1, v3, v2, s1
; CHECK-NEXT: .LBB1_4: ; %end
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: ds_write_b64 v2, v[0:1]
; CHECK-NEXT: s_endpgm
; Entry block: address 2048 pointer-sized elements past %inptr.
%gep0 = getelementptr ptr addrspace(3), ptr addrspace(3) %inptr, i32 2048
br i1 %cond, label %then, label %else
then:
; This arm only uses %gep0 as a load address.
%load0 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
%shuf0 = shufflevector <4 x half> %load0, <4 x half> %load0, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
br label %end
else:
%load1 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
; Extra use: %gep0 is the data being stored here, not the address accessed.
call void @llvm.amdgcn.raw.buffer.store(ptr addrspace(3) %gep0, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
%shuf1 = shufflevector <4 x half> %load1, <4 x half> %load1, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
br label %end
end:
; Merge the per-arm results and write them to %outptr.
%res = phi <4 x half> [ %shuf0, %then ], [ %shuf1, %else ]
store <4 x half> %res, ptr addrspace(3) %outptr
ret void
}
; Variant of the two-armed transposing-read kernel in which %gep0 is, in
; %else, also fed to @llvm.amdgcn.readfirstlane; the intrinsic's result is
; then stored to %outptr1. So %gep0 is an operand of an intrinsic that does
; not access memory through it.
; The autogenerated assertions below expect the address to be materialized
; once in the entry block (`s_add_i32 ..., 0x2000`), both ds_read_b64_tr_b16
; instructions to use that register with no immediate offset, and the same
; scalar register to be written out by ds_write_b32 (no readfirstlane
; instruction appears in the expected output).
; NOTE(review): presumably named "bad" because this non-address intrinsic use
; of %gep0 keeps the offset from being folded into the reads - confirm against
; the change that introduced this test.
define amdgpu_kernel void @badIntrinsicUse2(ptr addrspace(3) %inptr, i1 %cond, ptr addrspace(3) %outptr, ptr addrspace(3) %outptr1) {
; CHECK-LABEL: badIntrinsicUse2:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_add_i32 s4, s0, 0x2000
; CHECK-NEXT: s_cmp_eq_u32 s1, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB2_2
; CHECK-NEXT: ; %bb.1: ; %else
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0
; CHECK-NEXT: v_mov_b32_e32 v0, s3
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: s_mov_b32 s0, 0x7060302
; CHECK-NEXT: ds_write_b32 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: v_perm_b32 v0, v3, v2, s0
; CHECK-NEXT: s_mov_b32 s0, 0x5040100
; CHECK-NEXT: v_perm_b32 v1, v3, v2, s0
; CHECK-NEXT: s_cbranch_execz .LBB2_3
; CHECK-NEXT: s_branch .LBB2_4
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: ; implicit-def: $vgpr1
; CHECK-NEXT: .LBB2_3: ; %then
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0
; CHECK-NEXT: s_mov_b32 s0, 0x5040100
; CHECK-NEXT: s_mov_b32 s1, 0x7060302
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_perm_b32 v0, v3, v2, s0
; CHECK-NEXT: v_perm_b32 v1, v3, v2, s1
; CHECK-NEXT: .LBB2_4: ; %end
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: ds_write_b64 v2, v[0:1]
; CHECK-NEXT: s_endpgm
; Entry block: address 2048 pointer-sized elements past %inptr.
%gep0 = getelementptr ptr addrspace(3), ptr addrspace(3) %inptr, i32 2048
br i1 %cond, label %then, label %else
then:
; This arm only uses %gep0 as a load address.
%load0 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
%shuf0 = shufflevector <4 x half> %load0, <4 x half> %load0, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
br label %end
else:
%load1 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
; Extra use: the pointer value goes through readfirstlane and is then written
; to %outptr1, so it must exist as a value and not only as an addressing mode.
%gep1 = call ptr addrspace(3) @llvm.amdgcn.readfirstlane(ptr addrspace(3) %gep0)
store ptr addrspace(3) %gep1, ptr addrspace(3) %outptr1
%shuf1 = shufflevector <4 x half> %load1, <4 x half> %load1, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
br label %end
end:
; Merge the per-arm results and write them to %outptr.
%res = phi <4 x half> [ %shuf0, %then ], [ %shuf1, %else ]
store <4 x half> %res, ptr addrspace(3) %outptr
ret void
}