| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s |
| |
; @memoryIntrinstic: both sides of a branch on %cond read from the same
; address %gep0 = %inptr + 2048 elements via llvm.amdgcn.ds.read.tr16.b64,
; apply a different shufflevector mask on each side, merge the two results
; with a phi and store the merged vector to %outptr.
;
; In this function the two ds.read.tr16 calls are the only users of %gep0.
; The assertions below expect no separate address add: each branch emits
; ds_read_b64_tr_b16 with the displacement as an immediate (offset:8192).
| define amdgpu_kernel void @memoryIntrinstic(ptr addrspace(3) %inptr, i1 %cond, ptr addrspace(3) %outptr) { |
| ; CHECK-LABEL: memoryIntrinstic: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_bitcmp0_b32 s1, 0 |
| ; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 |
| ; CHECK-NEXT: ; %bb.1: ; %else |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s0 |
| ; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0 offset:8192 |
| ; CHECK-NEXT: s_mov_b32 s1, 0x7060302 |
| ; CHECK-NEXT: s_mov_b32 s3, 0x5040100 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_perm_b32 v0, v3, v2, s1 |
| ; CHECK-NEXT: v_perm_b32 v1, v3, v2, s3 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_3 |
| ; CHECK-NEXT: s_branch .LBB0_4 |
| ; CHECK-NEXT: .LBB0_2: |
| ; CHECK-NEXT: ; implicit-def: $vgpr1 |
| ; CHECK-NEXT: .LBB0_3: ; %then |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s0 |
| ; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0 offset:8192 |
| ; CHECK-NEXT: s_mov_b32 s0, 0x5040100 |
| ; CHECK-NEXT: s_mov_b32 s1, 0x7060302 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_perm_b32 v0, v3, v2, s0 |
| ; CHECK-NEXT: v_perm_b32 v1, v3, v2, s1 |
| ; CHECK-NEXT: .LBB0_4: ; %end |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s2 |
| ; CHECK-NEXT: ds_write_b64 v2, v[0:1] |
| ; CHECK-NEXT: s_endpgm |
; Shared address, defined once in the entry block and used in both successors.
; 2048 elements of type `ptr addrspace(3)` correspond to the 8192-byte
; displacement that appears in the assertions above.
| %gep0 = getelementptr ptr addrspace(3), ptr addrspace(3) %inptr, i32 2048 |
| br i1 %cond, label %then, label %else |
| |
| then: |
| %load0 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0) |
| %shuf0 = shufflevector <4 x half> %load0, <4 x half> %load0, <4 x i32> <i32 0, i32 2, i32 1, i32 3> |
| br label %end |
| |
| else: |
| %load1 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0) |
| %shuf1 = shufflevector <4 x half> %load1, <4 x half> %load1, <4 x i32> <i32 1, i32 3, i32 0, i32 2> |
| br label %end |
| |
| end: |
| %res = phi <4 x half> [ %shuf0, %then ], [ %shuf1, %else ] |
| store <4 x half> %res, ptr addrspace(3) %outptr |
| ret void |
| } |
| |
; @badIntrinsicUse: same branch / ds.read.tr16 / shuffle / phi / store shape
; as the other kernels in this file, but the %else block additionally passes
; %gep0 itself as the first operand of llvm.amdgcn.raw.buffer.store (with the
; extra kernel argument %rsrc), so the address has a user that is not one of
; the ds.read.tr16 calls.
;
; The assertions below expect the address to be computed once in the entry
; block (s_add_i32 s3, s0, 0x2000), both ds_read_b64_tr_b16 instructions to
; use that register with no offset: immediate, and the same value to be
; written out by buffer_store_dword.
; NOTE(review): the "bad" in the name presumably refers to this extra use
; keeping the offset from being folded into the loads - confirm against the
; change that introduced the test.
| define amdgpu_kernel void @badIntrinsicUse(ptr addrspace(3) %inptr, i1 %cond, ptr addrspace(3) %outptr, <4 x i32> %rsrc) { |
| ; CHECK-LABEL: badIntrinsicUse: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_and_b32 s1, s1, 1 |
| ; CHECK-NEXT: s_add_i32 s3, s0, 0x2000 |
| ; CHECK-NEXT: s_cmp_eq_u32 s1, 0 |
| ; CHECK-NEXT: s_cbranch_scc0 .LBB1_2 |
| ; CHECK-NEXT: ; %bb.1: ; %else |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s3 |
| ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 |
| ; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0 |
| ; CHECK-NEXT: s_mov_b32 s0, 0x7060302 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; CHECK-NEXT: v_perm_b32 v0, v3, v2, s0 |
| ; CHECK-NEXT: s_mov_b32 s0, 0x5040100 |
| ; CHECK-NEXT: v_perm_b32 v1, v3, v2, s0 |
| ; CHECK-NEXT: s_cbranch_execz .LBB1_3 |
| ; CHECK-NEXT: s_branch .LBB1_4 |
| ; CHECK-NEXT: .LBB1_2: |
| ; CHECK-NEXT: ; implicit-def: $vgpr1 |
| ; CHECK-NEXT: .LBB1_3: ; %then |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s3 |
| ; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0 |
| ; CHECK-NEXT: s_mov_b32 s0, 0x5040100 |
| ; CHECK-NEXT: s_mov_b32 s1, 0x7060302 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_perm_b32 v0, v3, v2, s0 |
| ; CHECK-NEXT: v_perm_b32 v1, v3, v2, s1 |
| ; CHECK-NEXT: .LBB1_4: ; %end |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s2 |
| ; CHECK-NEXT: ds_write_b64 v2, v[0:1] |
| ; CHECK-NEXT: s_endpgm |
; Shared address, defined once in the entry block and used in both successors.
| %gep0 = getelementptr ptr addrspace(3), ptr addrspace(3) %inptr, i32 2048 |
| br i1 %cond, label %then, label %else |
| |
| then: |
| %load0 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0) |
| %shuf0 = shufflevector <4 x half> %load0, <4 x half> %load0, <4 x i32> <i32 0, i32 2, i32 1, i32 3> |
| br label %end |
| |
| else: |
| %load1 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0) |
; Extra use of %gep0: here the pointer is an ordinary call operand rather than
; the address operand of the ds.read.tr16 call above.
| call void @llvm.amdgcn.raw.buffer.store(ptr addrspace(3) %gep0, <4 x i32> %rsrc, i32 0, i32 0, i32 0) |
| %shuf1 = shufflevector <4 x half> %load1, <4 x half> %load1, <4 x i32> <i32 1, i32 3, i32 0, i32 2> |
| br label %end |
| |
| end: |
| %res = phi <4 x half> [ %shuf0, %then ], [ %shuf1, %else ] |
| store <4 x half> %res, ptr addrspace(3) %outptr |
| ret void |
| } |
| |
; @badIntrinsicUse2: same branch / ds.read.tr16 / shuffle / phi / store shape
; as the other kernels in this file, but the %else block additionally feeds
; %gep0 through llvm.amdgcn.readfirstlane and stores the returned pointer to
; the extra kernel argument %outptr1, so the address has a user that is not
; one of the ds.read.tr16 calls.
;
; The assertions below expect the address to be computed once in the entry
; block (s_add_i32 s4, s0, 0x2000), both ds_read_b64_tr_b16 instructions to
; use that register with no offset: immediate, and the same register to be
; the value written by ds_write_b32 in the %else block.
; NOTE(review): the "bad" in the name presumably refers to this extra use
; keeping the offset from being folded into the loads - confirm against the
; change that introduced the test.
| define amdgpu_kernel void @badIntrinsicUse2(ptr addrspace(3) %inptr, i1 %cond, ptr addrspace(3) %outptr, ptr addrspace(3) %outptr1) { |
| ; CHECK-LABEL: badIntrinsicUse2: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_and_b32 s1, s1, 1 |
| ; CHECK-NEXT: s_add_i32 s4, s0, 0x2000 |
| ; CHECK-NEXT: s_cmp_eq_u32 s1, 0 |
| ; CHECK-NEXT: s_cbranch_scc0 .LBB2_2 |
| ; CHECK-NEXT: ; %bb.1: ; %else |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s4 |
| ; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s3 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, s4 |
| ; CHECK-NEXT: s_mov_b32 s0, 0x7060302 |
| ; CHECK-NEXT: ds_write_b32 v0, v1 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(1) |
| ; CHECK-NEXT: v_perm_b32 v0, v3, v2, s0 |
| ; CHECK-NEXT: s_mov_b32 s0, 0x5040100 |
| ; CHECK-NEXT: v_perm_b32 v1, v3, v2, s0 |
| ; CHECK-NEXT: s_cbranch_execz .LBB2_3 |
| ; CHECK-NEXT: s_branch .LBB2_4 |
| ; CHECK-NEXT: .LBB2_2: |
| ; CHECK-NEXT: ; implicit-def: $vgpr1 |
| ; CHECK-NEXT: .LBB2_3: ; %then |
| ; CHECK-NEXT: v_mov_b32_e32 v0, s4 |
| ; CHECK-NEXT: ds_read_b64_tr_b16 v[2:3], v0 |
| ; CHECK-NEXT: s_mov_b32 s0, 0x5040100 |
| ; CHECK-NEXT: s_mov_b32 s1, 0x7060302 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_perm_b32 v0, v3, v2, s0 |
| ; CHECK-NEXT: v_perm_b32 v1, v3, v2, s1 |
| ; CHECK-NEXT: .LBB2_4: ; %end |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s2 |
| ; CHECK-NEXT: ds_write_b64 v2, v[0:1] |
| ; CHECK-NEXT: s_endpgm |
; Shared address, defined once in the entry block and used in both successors.
| %gep0 = getelementptr ptr addrspace(3), ptr addrspace(3) %inptr, i32 2048 |
| br i1 %cond, label %then, label %else |
| |
| then: |
| %load0 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0) |
| %shuf0 = shufflevector <4 x half> %load0, <4 x half> %load0, <4 x i32> <i32 0, i32 2, i32 1, i32 3> |
| br label %end |
| |
| else: |
| %load1 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0) |
; Extra use of %gep0: the pointer goes through readfirstlane and the result is
; stored to %outptr1, in addition to being the address of the load above.
| %gep1 = call ptr addrspace(3) @llvm.amdgcn.readfirstlane(ptr addrspace(3) %gep0) |
| store ptr addrspace(3) %gep1, ptr addrspace(3) %outptr1 |
| %shuf1 = shufflevector <4 x half> %load1, <4 x half> %load1, <4 x i32> <i32 1, i32 3, i32 0, i32 2> |
| br label %end |
| |
| end: |
| %res = phi <4 x half> [ %shuf0, %then ], [ %shuf1, %else ] |
| store <4 x half> %res, ptr addrspace(3) %outptr |
| ret void |
| } |