; test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll - llvm - Git at Google

 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s

 ; Check that WQM is not triggered by the softwqm intrinsic alone.
 ; The only quad-mode related call in this function is softwqm on the float
 ; sum, so no "s_wqm_b64 exec, exec" may show up ahead of the loads and the add.
 ;
 ;CHECK-LABEL: {{^}}test1:
 ;CHECK-NOT: s_wqm_b64 exec, exec
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
 define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
 main_body:
   ; One float is loaded per index; the resource descriptor is undef.
   %lhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
   %rhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 false, i1 false)
   %sum = fadd float %lhs, %rhs
   ; The sum is passed through softwqm before being returned.
   %result = call float @llvm.amdgcn.softwqm.f32(float %sum)
   ret float %result
 }

 ; Check that the softwqm intrinsic works correctly for integers.
 ; The float sum is bitcast to i32, passed through the i32 overload of
 ; softwqm, and bitcast back before it is returned.
 ;
 ;CHECK-LABEL: {{^}}test2:
 ;CHECK-NOT: s_wqm_b64 exec, exec
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
 define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
 main_body:
   %lhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
   %rhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 false, i1 false)
   %sum = fadd float %lhs, %rhs
   ; float -> i32 -> softwqm.i32 -> float round trip.
   %sum.bits = bitcast float %sum to i32
   %soft.bits = call i32 @llvm.amdgcn.softwqm.i32(i32 %sum.bits)
   %result = bitcast i32 %soft.bits to float
   ret float %result
 }

 ; Make sure the transition from an Exact buffer store to softwqm does not
 ; trigger WQM. No wqm intrinsic and no image sample appear in this function,
 ; so softwqm is the only quad-mode related call.
 ;
 ; The negative directive between the store and the add must end in a colon
 ; like every other directive; with any other punctuation FileCheck skips the
 ; line and the absence of s_wqm_b64 after the store is never verified.
 ;
 ;CHECK-LABEL: {{^}}test_softwqm1:
 ;CHECK-NOT: s_wqm_b64 exec, exec
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_store_dword
 ;CHECK-NOT: s_wqm_b64 exec, exec
 ;CHECK: v_add_f32_e32
 define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
 main_body:
   %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
   %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
   %temp = fadd float %src0, %src1
   ; The sum is written back with a buffer store before it is reused.
   call void @llvm.amdgcn.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
   ; Only the doubled value, computed after the store, goes through softwqm.
   %out = fadd float %temp, %temp
   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
   ret float %out.0
 }

 ; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
 ; Here %temp is passed through the wqm intrinsic, the store in between is
 ; expected to run with the original exec mask restored (s_and_b64 with ORIG),
 ; and exec is expected to be widened again by s_wqm_b64 before the final add.
 ;
 ; The directive that looks for the second s_wqm_b64 must end in a colon; with
 ; any other punctuation FileCheck skips the line and the return to WQM after
 ; the store is never verified.
 ; NOTE(review): this makes a previously inert line active. It relies on the
 ; final v_add_f32 staying after the store in llc output - confirm by running
 ; the test.
 ;
 ;CHECK-LABEL: {{^}}test_softwqm2:
 ;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK: buffer_store_dword
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: v_add_f32_e32
 define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
 main_body:
   %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
   %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
   %temp = fadd float %src0, %src1
   ; The wqm call on the sum is the explicit WQM request in this function.
   %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp)
   call void @llvm.amdgcn.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
   ; The doubled value uses %temp (not %temp.0) and goes through softwqm.
   %out = fadd float %temp, %temp
   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
   ret float %out.0
 }

 ; Make sure the transition from Exact to WWM then softwqm does not trigger WQM.
 ; Sequence: load, store of that load, second load, sum passed through wwm,
 ; doubled wwm result passed through softwqm. The directives expect exec to be
 ; saved and set to -1 around the second load and the add, then restored, with
 ; no s_wqm_b64 afterwards.
 ;
 ;CHECK-LABEL: {{^}}test_wwm1:
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_store_dword
 ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
 ;CHECK-NOT: s_wqm_b64
 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
 main_body:
   %first = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
   ; The first loaded value is stored straight back at the same index.
   call void @llvm.amdgcn.buffer.store.f32(float %first, <4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
   %second = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 false, i1 false)
   %sum = fadd float %first, %second
   %sum.wwm = call float @llvm.amdgcn.wwm.f32(float %sum)
   ; The softwqm input is built from the wwm result only.
   %doubled = fadd float %sum.wwm, %sum.wwm
   %result = call float @llvm.amdgcn.softwqm.f32(float %doubled)
   ret float %result
 }

 ; Check that softwqm on one case of branch does not trigger WQM for shader.
 ; Only the IF arm calls softwqm; the ELSE arm just stores %data. %rsrc and
 ; %sampler are part of the signature but are not used in the body.
 ;
 ;CHECK-LABEL: {{^}}test_control_flow_0:
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NOT: s_wqm_b64 exec, exec
 ;CHECK: %ELSE
 ;CHECK: store
 ;CHECK: %IF
 ;CHECK: buffer_load
 ;CHECK: buffer_load
 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
 main_body:
   ; Branch on z == 0. Block names are kept because the directives match them.
   %z.is.zero = icmp eq i32 %z, 0
   br i1 %z.is.zero, label %IF, label %ELSE

 IF:
   %lhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
   %rhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 false, i1 false)
   %sum = fadd float %lhs, %rhs
   %if.val = call float @llvm.amdgcn.softwqm.f32(float %sum)
   br label %END

 ELSE:
   call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 false, i1 false)
   br label %END

 END:
   ; Result is the softwqm value from IF, or the untouched %data from ELSE.
   %merged = phi float [ %if.val, %IF ], [ %data, %ELSE ]
   ret float %merged
 }

 ; Check that softwqm on one case of branch is treated as WQM in WQM shader.
 ; main_body performs two dependent image samples before branching; the
 ; directives expect exec to be saved and switched to WQM right at the top,
 ; the ELSE store to run under a saved/restored exec mask, and no exec
 ; narrowing ahead of the loads in IF.
 ; NOTE(review): the sample calls reference attribute group #0, which is not
 ; defined in the visible part of this file - confirm whether that is intended.
 ;
 ;CHECK-LABEL: {{^}}test_control_flow_1:
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: %ELSE
 ;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
 ;CHECK: store
 ;CHECK: s_mov_b64 exec, [[SAVED]]
 ;CHECK: %IF
 ;CHECK-NOT: s_and_saveexec_b64
 ;CHECK-NOT: s_and_b64 exec
 ;CHECK: buffer_load
 ;CHECK: buffer_load
 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
 main_body:
   ; First sample uses %c reinterpreted as a float coordinate; the second
   ; sample uses component 0 of the first result as its coordinate.
   %coord = bitcast i32 %c to float
   %texel = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %texel.x = extractelement <4 x float> %texel, i32 0
   %texel2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %texel.x, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %sampled = extractelement <4 x float> %texel2, i32 0

   %z.is.zero = icmp eq i32 %z, 0
   br i1 %z.is.zero, label %IF, label %ELSE

 IF:
   %lhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
   %rhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 false, i1 false)
   %sum = fadd float %lhs, %rhs
   %if.val = call float @llvm.amdgcn.softwqm.f32(float %sum)
   br label %END

 ELSE:
   ; The sampled value is only consumed by this store.
   call void @llvm.amdgcn.buffer.store.f32(float %sampled, <4 x i32> undef, i32 %c, i32 0, i1 false, i1 false)
   br label %END

 END:
   ; The ELSE edge forwards the %data argument, not the sampled value.
   %merged = phi float [ %if.val, %IF ], [ %data, %ELSE ]
   ret float %merged
 }

 ; Buffer load/store intrinsics.
 ; NOTE(review): the stores carry group #2 (readonly) and the load carries
 ; group #3 (readnone); kept exactly as found - confirm this is intended.
 ; No call to the v4f32 store is visible in this file.
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2
 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2

 ; Image sample intrinsics. No call to the 2d variant is visible in this file.
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3

 ; Quad-mode / whole-wave intrinsics exercised by the tests, plus kill
 ; (no call to kill is visible in this file).
 declare float @llvm.amdgcn.wqm.f32(float) #3
 declare float @llvm.amdgcn.softwqm.f32(float) #3
 declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
 declare float @llvm.amdgcn.wwm.f32(float) #3
 declare void @llvm.amdgcn.kill(i1) #1

 ; Attribute groups referenced by the declarations above.
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
 attributes #3 = { nounwind readnone }
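	; NOTE(review): the text from this line to the end repeats the module above; kept as one file, every function would be defined twice.
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s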

	; Check that WQM is not triggered by the softwqm intrinsic alone.
	; The only quad-mode related call in this function is softwqm on the float
	; sum, so no "s_wqm_b64 exec, exec" may show up ahead of the loads and the add.
	;
	;CHECK-LABEL: {{^}}test1:
	;CHECK-NOT: s_wqm_b64 exec, exec
	;CHECK: buffer_load_dword
	;CHECK: buffer_load_dword
	;CHECK: v_add_f32_e32
	define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
	main_body:
	  ; One float is loaded per index; the resource descriptor is undef.
	  %lhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
	  %rhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 false, i1 false)
	  %sum = fadd float %lhs, %rhs
	  ; The sum is passed through softwqm before being returned.
	  %result = call float @llvm.amdgcn.softwqm.f32(float %sum)
	  ret float %result
	}

	; Check that the softwqm intrinsic works correctly for integers.
	; The float sum is bitcast to i32, passed through the i32 overload of
	; softwqm, and bitcast back before it is returned.
	;
	;CHECK-LABEL: {{^}}test2:
	;CHECK-NOT: s_wqm_b64 exec, exec
	;CHECK: buffer_load_dword
	;CHECK: buffer_load_dword
	;CHECK: v_add_f32_e32
	define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
	main_body:
	  %lhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
	  %rhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 false, i1 false)
	  %sum = fadd float %lhs, %rhs
	  ; float -> i32 -> softwqm.i32 -> float round trip.
	  %sum.bits = bitcast float %sum to i32
	  %soft.bits = call i32 @llvm.amdgcn.softwqm.i32(i32 %sum.bits)
	  %result = bitcast i32 %soft.bits to float
	  ret float %result
	}

	; Make sure the transition from an Exact buffer store to softwqm does not
	; trigger WQM. No wqm intrinsic and no image sample appear in this function,
	; so softwqm is the only quad-mode related call.
	;
	; The negative directive between the store and the add must end in a colon
	; like every other directive; with any other punctuation FileCheck skips the
	; line and the absence of s_wqm_b64 after the store is never verified.
	;
	;CHECK-LABEL: {{^}}test_softwqm1:
	;CHECK-NOT: s_wqm_b64 exec, exec
	;CHECK: buffer_load_dword
	;CHECK: buffer_load_dword
	;CHECK: buffer_store_dword
	;CHECK-NOT: s_wqm_b64 exec, exec
	;CHECK: v_add_f32_e32
	define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
	main_body:
	  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
	  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
	  %temp = fadd float %src0, %src1
	  ; The sum is written back with a buffer store before it is reused.
	  call void @llvm.amdgcn.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
	  ; Only the doubled value, computed after the store, goes through softwqm.
	  %out = fadd float %temp, %temp
	  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
	  ret float %out.0
	}

	; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
	; Here %temp is passed through the wqm intrinsic, the store in between is
	; expected to run with the original exec mask restored (s_and_b64 with ORIG),
	; and exec is expected to be widened again by s_wqm_b64 before the final add.
	;
	; The directive that looks for the second s_wqm_b64 must end in a colon; with
	; any other punctuation FileCheck skips the line and the return to WQM after
	; the store is never verified.
	; NOTE(review): this makes a previously inert line active. It relies on the
	; final v_add_f32 staying after the store in llc output - confirm by running
	; the test.
	;
	;CHECK-LABEL: {{^}}test_softwqm2:
	;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK: s_wqm_b64 exec, exec
	;CHECK: buffer_load_dword
	;CHECK: buffer_load_dword
	;CHECK: s_and_b64 exec, exec, [[ORIG]]
	;CHECK: buffer_store_dword
	;CHECK: s_wqm_b64 exec, exec
	;CHECK: v_add_f32_e32
	define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
	main_body:
	  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
	  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
	  %temp = fadd float %src0, %src1
	  ; The wqm call on the sum is the explicit WQM request in this function.
	  %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp)
	  call void @llvm.amdgcn.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
	  ; The doubled value uses %temp (not %temp.0) and goes through softwqm.
	  %out = fadd float %temp, %temp
	  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
	  ret float %out.0
	}

	; Make sure the transition from Exact to WWM then softwqm does not trigger WQM.
	; Sequence: load, store of that load, second load, sum passed through wwm,
	; doubled wwm result passed through softwqm. The directives expect exec to be
	; saved and set to -1 around the second load and the add, then restored, with
	; no s_wqm_b64 afterwards.
	;
	;CHECK-LABEL: {{^}}test_wwm1:
	;CHECK: buffer_load_dword
	;CHECK: buffer_store_dword
	;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
	;CHECK: buffer_load_dword
	;CHECK: v_add_f32_e32
	;CHECK: s_mov_b64 exec, [[ORIG]]
	;CHECK-NOT: s_wqm_b64
	define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
	main_body:
	  %first = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
	  ; The first loaded value is stored straight back at the same index.
	  call void @llvm.amdgcn.buffer.store.f32(float %first, <4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
	  %second = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 false, i1 false)
	  %sum = fadd float %first, %second
	  %sum.wwm = call float @llvm.amdgcn.wwm.f32(float %sum)
	  ; The softwqm input is built from the wwm result only.
	  %doubled = fadd float %sum.wwm, %sum.wwm
	  %result = call float @llvm.amdgcn.softwqm.f32(float %doubled)
	  ret float %result
	}

	; Check that softwqm on one case of branch does not trigger WQM for shader.
	; Only the IF arm calls softwqm; the ELSE arm just stores %data. %rsrc and
	; %sampler are part of the signature but are not used in the body.
	;
	;CHECK-LABEL: {{^}}test_control_flow_0:
	;CHECK-NEXT: ; %main_body
	;CHECK-NOT: s_wqm_b64 exec, exec
	;CHECK: %ELSE
	;CHECK: store
	;CHECK: %IF
	;CHECK: buffer_load
	;CHECK: buffer_load
	define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
	main_body:
	  ; Branch on z == 0. Block names are kept because the directives match them.
	  %z.is.zero = icmp eq i32 %z, 0
	  br i1 %z.is.zero, label %IF, label %ELSE

	IF:
	  %lhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
	  %rhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 false, i1 false)
	  %sum = fadd float %lhs, %rhs
	  %if.val = call float @llvm.amdgcn.softwqm.f32(float %sum)
	  br label %END

	ELSE:
	  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 false, i1 false)
	  br label %END

	END:
	  ; Result is the softwqm value from IF, or the untouched %data from ELSE.
	  %merged = phi float [ %if.val, %IF ], [ %data, %ELSE ]
	  ret float %merged
	}

	; Check that softwqm on one case of branch is treated as WQM in WQM shader.
	; main_body performs two dependent image samples before branching; the
	; directives expect exec to be saved and switched to WQM right at the top,
	; the ELSE store to run under a saved/restored exec mask, and no exec
	; narrowing ahead of the loads in IF.
	; NOTE(review): the sample calls reference attribute group #0, which is not
	; defined in the visible part of this file - confirm whether that is intended.
	;
	;CHECK-LABEL: {{^}}test_control_flow_1:
	;CHECK-NEXT: ; %main_body
	;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK-NEXT: s_wqm_b64 exec, exec
	;CHECK: %ELSE
	;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
	;CHECK: store
	;CHECK: s_mov_b64 exec, [[SAVED]]
	;CHECK: %IF
	;CHECK-NOT: s_and_saveexec_b64
	;CHECK-NOT: s_and_b64 exec
	;CHECK: buffer_load
	;CHECK: buffer_load
	define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
	main_body:
	  ; First sample uses %c reinterpreted as a float coordinate; the second
	  ; sample uses component 0 of the first result as its coordinate.
	  %coord = bitcast i32 %c to float
	  %texel = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
	  %texel.x = extractelement <4 x float> %texel, i32 0
	  %texel2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %texel.x, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
	  %sampled = extractelement <4 x float> %texel2, i32 0

	  %z.is.zero = icmp eq i32 %z, 0
	  br i1 %z.is.zero, label %IF, label %ELSE

	IF:
	  %lhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 false, i1 false)
	  %rhs = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 false, i1 false)
	  %sum = fadd float %lhs, %rhs
	  %if.val = call float @llvm.amdgcn.softwqm.f32(float %sum)
	  br label %END

	ELSE:
	  ; The sampled value is only consumed by this store.
	  call void @llvm.amdgcn.buffer.store.f32(float %sampled, <4 x i32> undef, i32 %c, i32 0, i1 false, i1 false)
	  br label %END

	END:
	  ; The ELSE edge forwards the %data argument, not the sampled value.
	  %merged = phi float [ %if.val, %IF ], [ %data, %ELSE ]
	  ret float %merged
	}

	; Buffer load/store intrinsics.
	; NOTE(review): the stores carry group #2 (readonly) and the load carries
	; group #3 (readnone); kept exactly as found - confirm this is intended.
	; No call to the v4f32 store is visible in this file.
	declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
	declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2
	declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2

	; Image sample intrinsics. No call to the 2d variant is visible in this file.
	declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
	declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3

	; Quad-mode / whole-wave intrinsics exercised by the tests, plus kill
	; (no call to kill is visible in this file).
	declare float @llvm.amdgcn.wqm.f32(float) #3
	declare float @llvm.amdgcn.softwqm.f32(float) #3
	declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
	declare float @llvm.amdgcn.wwm.f32(float) #3
	declare void @llvm.amdgcn.kill(i1) #1

	; Attribute groups referenced by the declarations above.
	attributes #1 = { nounwind }
	attributes #2 = { nounwind readonly }
	attributes #3 = { nounwind readnone }