; llvm/test/CodeGen/AMDGPU/triton_regression_no_waterfall.ll - llvm-project - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck --check-prefix=GCN %s

 ; Regression test kernel. A float-to-unsigned conversion result feeds, through
 ; a shift/or/and/shift chain, every element of the <4 x i32> that is passed as
 ; the second operand of the raw buffer store (presumably the buffer resource
 ; descriptor -- confirm against the intrinsic definition).
 ;
 ; The assertions below expect straight-line code in a single basic block: the
 ; converted value is copied from v0 to s0 with v_readfirstlane_b32, the integer
 ; chain is carried out with scalar instructions, and the store is issued once
 ; with s[4:7], with no loop around it.
 define amdgpu_kernel void @test_should_convert_to_v_readfirstlane_b32(float %fval, i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test_should_convert_to_v_readfirstlane_b32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, s0
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 16
 ; GCN-NEXT:    s_or_b32 s5, s0, s1
 ; GCN-NEXT:    s_and_b32 s6, s5, s2
 ; GCN-NEXT:    s_lshr_b32 s4, s6, 2
 ; GCN-NEXT:    s_mov_b32 s7, s4
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0 nt
 ; GCN-NEXT:    s_endpgm
 entry:
   ; Float-to-unsigned conversion; the expected output performs it with
   ; v_cvt_u32_f32_e32 into v0.
   %conv = fptoui float %fval to i32

   ; Integer chain derived from %conv. %or, %and and %shr are all reused as
   ; vector elements below; %shl is only an intermediate.
   %shl = shl i32 %conv, 16
   %or = or i32 %shl, %arg1
   %and = and i32 %or, %arg2
   %shr = lshr i32 %and, 2

   ; Build the <4 x i32> as { %shr, %or, %and, %shr }. In the expected output
   ; these land in s4, s5, s6 and s7 (s7 is a copy of s4).
   %sgpr128_0 = insertelement <4 x i32> poison, i32 %shr, i32 0
   %sgpr128_1 = insertelement <4 x i32> %sgpr128_0, i32 %or, i32 1
   %sgpr128_2 = insertelement <4 x i32> %sgpr128_1, i32 %and, i32 2
   %sgpr128_3 = insertelement <4 x i32> %sgpr128_2, i32 %shr, i32 3

   ; Store %arg1 through the vector built above. The expected store carries the
   ; `nt` modifier; presumably that comes from the final i32 2 operand -- verify
   ; against the intrinsic documentation.
   call void @llvm.amdgcn.raw.buffer.store.i32(i32 %arg1, <4 x i32> %sgpr128_3, i32 0, i32 0, i32 2)

   ret void
 }

 ; Declaration of the raw buffer store intrinsic called by the kernel above:
 ; an i32 value, a <4 x i32> operand, and three trailing i32 operands.
 declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #0

 ; Attribute group #0, referenced by the intrinsic declaration.
 attributes #0 = { nounwind }
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
	; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck --check-prefix=GCN %s

	; Regression test kernel. A float-to-unsigned conversion result feeds, through
	; a shift/or/and/shift chain, every element of the <4 x i32> that is passed as
	; the second operand of the raw buffer store (presumably the buffer resource
	; descriptor -- confirm against the intrinsic definition).
	;
	; The assertions below expect straight-line code in a single basic block: the
	; converted value is copied from v0 to s0 with v_readfirstlane_b32, the integer
	; chain is carried out with scalar instructions, and the store is issued once
	; with s[4:7], with no loop around it.
	define amdgpu_kernel void @test_should_convert_to_v_readfirstlane_b32(float %fval, i32 %arg1, i32 %arg2) {
	; GCN-LABEL: test_should_convert_to_v_readfirstlane_b32:
	; GCN: ; %bb.0: ; %entry
	; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
	; GCN-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NEXT: v_cvt_u32_f32_e32 v0, s0
	; GCN-NEXT: s_nop 0
	; GCN-NEXT: v_readfirstlane_b32 s0, v0
	; GCN-NEXT: s_lshl_b32 s0, s0, 16
	; GCN-NEXT: s_or_b32 s5, s0, s1
	; GCN-NEXT: s_and_b32 s6, s5, s2
	; GCN-NEXT: s_lshr_b32 s4, s6, 2
	; GCN-NEXT: s_mov_b32 s7, s4
	; GCN-NEXT: v_mov_b32_e32 v0, s1
	; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 nt
	; GCN-NEXT: s_endpgm
	entry:
	; Float-to-unsigned conversion; the expected output performs it with
	; v_cvt_u32_f32_e32 into v0.
	%conv = fptoui float %fval to i32

	; Integer chain derived from %conv. %or, %and and %shr are all reused as
	; vector elements below; %shl is only an intermediate.
	%shl = shl i32 %conv, 16
	%or = or i32 %shl, %arg1
	%and = and i32 %or, %arg2
	%shr = lshr i32 %and, 2

	; Build the <4 x i32> as { %shr, %or, %and, %shr }. In the expected output
	; these land in s4, s5, s6 and s7 (s7 is a copy of s4).
	%sgpr128_0 = insertelement <4 x i32> poison, i32 %shr, i32 0
	%sgpr128_1 = insertelement <4 x i32> %sgpr128_0, i32 %or, i32 1
	%sgpr128_2 = insertelement <4 x i32> %sgpr128_1, i32 %and, i32 2
	%sgpr128_3 = insertelement <4 x i32> %sgpr128_2, i32 %shr, i32 3

	; Store %arg1 through the vector built above. The expected store carries the
	; `nt` modifier; presumably that comes from the final i32 2 operand -- verify
	; against the intrinsic documentation.
	call void @llvm.amdgcn.raw.buffer.store.i32(i32 %arg1, <4 x i32> %sgpr128_3, i32 0, i32 0, i32 2)

	ret void
	}

	; Declaration of the raw buffer store intrinsic called by the kernel above:
	; an i32 value, a <4 x i32> operand, and three trailing i32 operands.
	declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #0

	; Attribute group #0, referenced by the intrinsic declaration.
	attributes #0 = { nounwind }