; test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll - llvm-project/llvm - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s

 ; Check that WQM is not triggered by the softwqm intrinsic alone.
 ;
 ; The body loads two floats with struct.buffer.load (the descriptor is undef,
 ; the indices are the two inreg arguments), adds them and passes the sum
 ; through llvm.amdgcn.softwqm.f32 before returning it. The expected ISA below
 ; is one straight-line block that contains no s_wqm_b64.
 define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
 ; CHECK-LABEL: test1:
 ; CHECK:       ; %bb.0: ; %main_body
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v1
 ; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
 ; CHECK-NEXT:    ; return to shader part epilog
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
   %out = fadd float %src0, %src1
   ; softwqm is the only WQM-related intrinsic called in this function.
   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
   ret float %out.0
 }

 ; Check that the softwqm intrinsic works correctly for integers.
 ;
 ; Same computation as test1, but the sum is bitcast to i32, passed through
 ; llvm.amdgcn.softwqm.i32 and bitcast back to float. The expected ISA lines
 ; are the same as those of test1.
 define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
 ; CHECK-LABEL: test2:
 ; CHECK:       ; %bb.0: ; %main_body
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v1
 ; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
 ; CHECK-NEXT:    ; return to shader part epilog
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
   %out = fadd float %src0, %src1
   ; Round-trip through i32 so that the integer overload of softwqm is used.
   %out.0 = bitcast float %out to i32
   %out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0)
   %out.2 = bitcast i32 %out.1 to float
   ret float %out.2
 }

 ; Make sure the transition from WQM to Exact to softwqm does not trigger WQM.
 ;
 ; The sum of two buffer loads is stored with struct.buffer.store, then doubled
 ; and passed through llvm.amdgcn.softwqm.f32. The expected ISA below contains
 ; no s_wqm_b64.
 ; NOTE(review): the IR contains no llvm.amdgcn.wqm call, only a buffer store
 ; followed by a softwqm use; presumably the store is what requires Exact
 ; mode - confirm the wording of the first sentence above.
 define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
 ; CHECK-LABEL: test_softwqm1:
 ; CHECK:       ; %bb.0: ; %main_body
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s1
 ; CHECK-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 idxen
 ; CHECK-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_add_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
 ; CHECK-NEXT:    v_add_f32_e32 v0, v1, v1
 ; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ; return to shader part epilog
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
   %temp = fadd float %src0, %src1
   ; The store writes %temp at index %idx0, the same index used by the first load.
   call void @llvm.amdgcn.struct.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %out = fadd float %temp, %temp
   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
   ret float %out.0
 }

 ; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
 ;
 ; Unlike test_softwqm1, the stored value first goes through
 ; llvm.amdgcn.wqm.f32. The expected ISA saves exec into s[2:3] and executes
 ; s_wqm_b64 at entry, restricts exec with s_and_b64 before the store, executes
 ; s_wqm_b64 again for the final move into v0, and restricts exec once more
 ; before returning.
 define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
 ; CHECK-LABEL: test_softwqm2:
 ; CHECK:       ; %bb.0: ; %main_body
 ; CHECK-NEXT:    s_mov_b64 s[2:3], exec
 ; CHECK-NEXT:    s_wqm_b64 exec, exec
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s1
 ; CHECK-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 idxen
 ; CHECK-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_add_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v1
 ; CHECK-NEXT:    v_add_f32_e32 v1, v1, v1
 ; CHECK-NEXT:    s_and_b64 exec, exec, s[2:3]
 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_wqm_b64 exec, exec
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v1
 ; CHECK-NEXT:    s_and_b64 exec, exec, s[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ; return to shader part epilog
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
   %temp = fadd float %src0, %src1
   ; Explicit wqm on the value that is stored.
   %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp)
   call void @llvm.amdgcn.struct.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   ; The returned value is derived from %temp, not from the wqm result %temp.0.
   %out = fadd float %temp, %temp
   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
   ret float %out.0
 }

 ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
 ; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
 ;
 ; The first load is stored back unchanged, then added to a second load; the
 ; sum goes through llvm.amdgcn.wwm.f32, is doubled, and is passed through
 ; llvm.amdgcn.softwqm.f32. The expected ISA brackets the loads and the first
 ; add with s_or_saveexec_b64 s[2:3], -1 / s_mov_b64 exec, s[2:3] and contains
 ; no s_wqm_b64. It is the same sequence as expected for test_strict_wwm1.
 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
 ; CHECK-LABEL: test_wwm1:
 ; CHECK:       ; %bb.0: ; %main_body
 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s0
 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_mov_b64 exec, s[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_add_f32_e32 v1, v2, v1
 ; CHECK-NEXT:    s_mov_b64 exec, s[2:3]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v1
 ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v0
 ; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
 ; CHECK-NEXT:    ; return to shader part epilog
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
   %temp = fadd float %src0, %src1
   ; Deprecated spelling of the intrinsic; test_strict_wwm1 uses strict.wwm.
   %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
   %out = fadd float %temp.0, %temp.0
   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
   ret float %out.0
 }

 ; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
 ;
 ; Same IR as test_wwm1 except that the sum goes through
 ; llvm.amdgcn.strict.wwm.f32. The expected ISA brackets the loads and the
 ; first add with s_or_saveexec_b64 s[2:3], -1 / s_mov_b64 exec, s[2:3] and
 ; contains no s_wqm_b64.
 define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
 ; CHECK-LABEL: test_strict_wwm1:
 ; CHECK:       ; %bb.0: ; %main_body
 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s0
 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_mov_b64 exec, s[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_add_f32_e32 v1, v2, v1
 ; CHECK-NEXT:    s_mov_b64 exec, s[2:3]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v1
 ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v0
 ; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
 ; CHECK-NEXT:    ; return to shader part epilog
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
   %temp = fadd float %src0, %src1
   %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
   ; The value returned through softwqm is computed from the strict.wwm result.
   %out = fadd float %temp.0, %temp.0
   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
   ret float %out.0
 }


 ; Check that softwqm on one case of branch does not trigger WQM for shader.
 ;
 ; The function branches on %z == 0. IF adds two buffer loads and passes the
 ; sum through llvm.amdgcn.softwqm.f32; ELSE only stores %data at index %c.
 ; END returns the softwqm result from IF or the unmodified %data from ELSE.
 ; %rsrc and %sampler are not used by the body. The expected ISA contains the
 ; usual exec save/restore for the branch but no s_wqm_b64.
 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
 ; CHECK-LABEL: test_control_flow_0:
 ; CHECK:       ; %bb.0: ; %main_body
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; CHECK-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; CHECK-NEXT:    s_cbranch_execz .LBB6_2
 ; CHECK-NEXT:  ; %bb.1: ; %ELSE
 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
 ; CHECK-NEXT:  .LBB6_2: ; %Flow
 ; CHECK-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
 ; CHECK-NEXT:    s_cbranch_execz .LBB6_4
 ; CHECK-NEXT:  ; %bb.3: ; %IF
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s12
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s13
 ; CHECK-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_add_f32_e32 v2, v0, v1
 ; CHECK-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
 ; CHECK-NEXT:  .LBB6_4: ; %END
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v2
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ; return to shader part epilog
 main_body:
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %IF, label %ELSE

 ; softwqm appears only on this path.
 IF:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
   %out = fadd float %src0, %src1
   %data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
   br label %END

 ; This path only performs a buffer store of the incoming %data.
 ELSE:
   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
   br label %END

 END:
   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
   ret float %r
 }

 ; Check that softwqm on one case of branch is treated as WQM in WQM shader.
 ;
 ; Same shape as test_control_flow_0, but ELSE performs two dependent
 ; image.sample.1d calls (the second samples at the first result) and stores
 ; the second result. The expected ISA saves exec into s[14:15] and executes
 ; s_wqm_b64 at entry, narrows exec inside ELSE with s_and_saveexec_b64 after
 ; the first image_sample, and applies s_and_b64 exec, exec, s[14:15] in END
 ; before the return value is moved into v0.
 ; NOTE(review): the image.sample call sites reference attribute group #0,
 ; which is not defined in the visible part of this file - confirm.
 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
 ; CHECK-LABEL: test_control_flow_1:
 ; CHECK:       ; %bb.0: ; %main_body
 ; CHECK-NEXT:    s_mov_b64 s[14:15], exec
 ; CHECK-NEXT:    s_wqm_b64 exec, exec
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[16:17], vcc
 ; CHECK-NEXT:    s_xor_b64 s[16:17], exec, s[16:17]
 ; CHECK-NEXT:    s_cbranch_execz .LBB7_2
 ; CHECK-NEXT:  ; %bb.1: ; %ELSE
 ; CHECK-NEXT:    image_sample v1, v0, s[0:7], s[8:11] dmask:0x1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[18:19], s[14:15]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 ; CHECK-NEXT:  .LBB7_2: ; %Flow
 ; CHECK-NEXT:    s_andn2_saveexec_b64 s[0:1], s[16:17]
 ; CHECK-NEXT:    s_cbranch_execz .LBB7_4
 ; CHECK-NEXT:  ; %bb.3: ; %IF
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s12
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s13
 ; CHECK-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_add_f32_e32 v2, v0, v1
 ; CHECK-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
 ; CHECK-NEXT:  .LBB7_4: ; %END
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; CHECK-NEXT:    s_and_b64 exec, exec, s[14:15]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v2
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ; return to shader part epilog
 main_body:
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %IF, label %ELSE

 ; softwqm appears only on this path.
 IF:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
   %out = fadd float %src0, %src1
   %data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
   br label %END

 ; Image sampling path: the first sample's element 0 is the coordinate of the
 ; second sample, whose element 0 is then stored.
 ELSE:
   %c.bc = bitcast i32 %c to float
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
   %tex0 = extractelement <4 x float> %tex, i32 0
   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
   %data.sample = extractelement <4 x float> %dtex, i32 0

   call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
   br label %END

 ; The ELSE incoming value is the original %data, not %data.sample.
 END:
   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
   ret float %r
 }

 ; Intrinsic declarations for the tests above.
 ; NOTE(review): struct.buffer.store.v4f32, image.sample.2d and wqm.demote are
 ; declared here but not called anywhere in the visible part of this file.
 ; NOTE(review): the buffer store declarations use group #2 (nounwind
 ; readonly) - confirm that this is intended for a store.
 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare float @llvm.amdgcn.wqm.f32(float) #3
 declare float @llvm.amdgcn.softwqm.f32(float) #3
 declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
 declare float @llvm.amdgcn.strict.wwm.f32(float) #3
 declare float @llvm.amdgcn.wwm.f32(float) #3
 declare void @llvm.amdgcn.wqm.demote(i1) #1

 ; Attribute groups used by the declarations above.
 ; NOTE(review): test_control_flow_1 references group #0 at its image.sample
 ; call sites, but no #0 group is defined in the visible part of this file.
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
 attributes #3 = { nounwind readnone }
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s

	; Check that WQM is not triggered by the softwqm intrinsic alone.
	;
	; Two buffer loads are added and the sum is passed through
	; llvm.amdgcn.softwqm.f32; the expected ISA below contains no s_wqm_b64.
	; NOTE(review): @test1 is also defined earlier in this file with the same
	; body; this second definition looks like an extraction artifact - confirm.
	define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
	; CHECK-LABEL: test1:
	; CHECK: ; %bb.0: ; %main_body
	; CHECK-NEXT: v_mov_b32_e32 v0, s0
	; CHECK-NEXT: v_mov_b32_e32 v1, s1
	; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
	; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: v_add_f32_e32 v0, v0, v1
	; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
	; CHECK-NEXT: ; return to shader part epilog
	main_body:
	%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
	%out = fadd float %src0, %src1
	; softwqm is the only WQM-related intrinsic called in this function.
	%out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
	ret float %out.0
	}

	; Check that the softwqm intrinsic works correctly for integers.
	;
	; The float sum is bitcast to i32, passed through llvm.amdgcn.softwqm.i32
	; and bitcast back; the expected ISA below contains no s_wqm_b64.
	; NOTE(review): @test2 is also defined earlier in this file with the same
	; body; this second definition looks like an extraction artifact - confirm.
	define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
	; CHECK-LABEL: test2:
	; CHECK: ; %bb.0: ; %main_body
	; CHECK-NEXT: v_mov_b32_e32 v0, s0
	; CHECK-NEXT: v_mov_b32_e32 v1, s1
	; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
	; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: v_add_f32_e32 v0, v0, v1
	; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
	; CHECK-NEXT: ; return to shader part epilog
	main_body:
	%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
	%out = fadd float %src0, %src1
	; Round-trip through i32 so that the integer overload of softwqm is used.
	%out.0 = bitcast float %out to i32
	%out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0)
	%out.2 = bitcast i32 %out.1 to float
	ret float %out.2
	}

	; Make sure the transition from WQM to Exact to softwqm does not trigger WQM.
	;
	; The sum of two buffer loads is stored, then doubled and passed through
	; llvm.amdgcn.softwqm.f32; the expected ISA below contains no s_wqm_b64.
	; NOTE(review): the IR contains no llvm.amdgcn.wqm call; presumably the
	; buffer store is what requires Exact mode - confirm the sentence above.
	; NOTE(review): @test_softwqm1 is also defined earlier in this file; this
	; second definition looks like an extraction artifact - confirm.
	define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
	; CHECK-LABEL: test_softwqm1:
	; CHECK: ; %bb.0: ; %main_body
	; CHECK-NEXT: v_mov_b32_e32 v0, s0
	; CHECK-NEXT: v_mov_b32_e32 v2, s1
	; CHECK-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
	; CHECK-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: v_add_f32_e32 v1, v1, v2
	; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
	; CHECK-NEXT: v_add_f32_e32 v0, v1, v1
	; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: ; return to shader part epilog
	main_body:
	%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
	%temp = fadd float %src0, %src1
	; The store writes %temp at index %idx0, the same index used by the first load.
	call void @llvm.amdgcn.struct.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	%out = fadd float %temp, %temp
	%out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
	ret float %out.0
	}

	; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
	;
	; The stored value goes through llvm.amdgcn.wqm.f32 first. The expected ISA
	; executes s_wqm_b64 at entry and again after the store, each time followed
	; later by s_and_b64 exec, exec, s[2:3].
	; NOTE(review): @test_softwqm2 is also defined earlier in this file; this
	; second definition looks like an extraction artifact - confirm.
	define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
	; CHECK-LABEL: test_softwqm2:
	; CHECK: ; %bb.0: ; %main_body
	; CHECK-NEXT: s_mov_b64 s[2:3], exec
	; CHECK-NEXT: s_wqm_b64 exec, exec
	; CHECK-NEXT: v_mov_b32_e32 v0, s0
	; CHECK-NEXT: v_mov_b32_e32 v2, s1
	; CHECK-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
	; CHECK-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: v_add_f32_e32 v1, v1, v2
	; CHECK-NEXT: v_mov_b32_e32 v2, v1
	; CHECK-NEXT: v_add_f32_e32 v1, v1, v1
	; CHECK-NEXT: s_and_b64 exec, exec, s[2:3]
	; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
	; CHECK-NEXT: s_wqm_b64 exec, exec
	; CHECK-NEXT: v_mov_b32_e32 v0, v1
	; CHECK-NEXT: s_and_b64 exec, exec, s[2:3]
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: ; return to shader part epilog
	main_body:
	%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
	%temp = fadd float %src0, %src1
	; Explicit wqm on the value that is stored.
	%temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp)
	call void @llvm.amdgcn.struct.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	; The returned value is derived from %temp, not from the wqm result %temp.0.
	%out = fadd float %temp, %temp
	%out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
	ret float %out.0
	}

	; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
	; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
	;
	; The sum of two loads goes through llvm.amdgcn.wwm.f32, is doubled and is
	; passed through llvm.amdgcn.softwqm.f32. The expected ISA uses
	; s_or_saveexec_b64 / s_mov_b64 exec pairs and contains no s_wqm_b64.
	; NOTE(review): @test_wwm1 is also defined earlier in this file; this
	; second definition looks like an extraction artifact - confirm.
	define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
	; CHECK-LABEL: test_wwm1:
	; CHECK: ; %bb.0: ; %main_body
	; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
	; CHECK-NEXT: v_mov_b32_e32 v1, s0
	; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
	; CHECK-NEXT: s_mov_b64 exec, s[2:3]
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
	; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
	; CHECK-NEXT: v_mov_b32_e32 v1, s1
	; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: v_add_f32_e32 v1, v2, v1
	; CHECK-NEXT: s_mov_b64 exec, s[2:3]
	; CHECK-NEXT: v_mov_b32_e32 v0, v1
	; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
	; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
	; CHECK-NEXT: ; return to shader part epilog
	main_body:
	%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
	%temp = fadd float %src0, %src1
	; Deprecated spelling of the intrinsic; test_strict_wwm1 uses strict.wwm.
	%temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
	%out = fadd float %temp.0, %temp.0
	%out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
	ret float %out.0
	}

	; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
	;
	; The sum of two loads goes through llvm.amdgcn.strict.wwm.f32, is doubled
	; and is passed through llvm.amdgcn.softwqm.f32. The expected ISA uses
	; s_or_saveexec_b64 / s_mov_b64 exec pairs and contains no s_wqm_b64.
	; NOTE(review): @test_strict_wwm1 is also defined earlier in this file;
	; this second definition looks like an extraction artifact - confirm.
	define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
	; CHECK-LABEL: test_strict_wwm1:
	; CHECK: ; %bb.0: ; %main_body
	; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
	; CHECK-NEXT: v_mov_b32_e32 v1, s0
	; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
	; CHECK-NEXT: s_mov_b64 exec, s[2:3]
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
	; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1
	; CHECK-NEXT: v_mov_b32_e32 v1, s1
	; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: v_add_f32_e32 v1, v2, v1
	; CHECK-NEXT: s_mov_b64 exec, s[2:3]
	; CHECK-NEXT: v_mov_b32_e32 v0, v1
	; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
	; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
	; CHECK-NEXT: ; return to shader part epilog
	main_body:
	%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
	%temp = fadd float %src0, %src1
	%temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
	; The value returned through softwqm is computed from the strict.wwm result.
	%out = fadd float %temp.0, %temp.0
	%out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
	ret float %out.0
	}


	; Check that softwqm on one case of branch does not trigger WQM for shader.
	;
	; Branches on %z == 0: IF adds two buffer loads and passes the sum through
	; llvm.amdgcn.softwqm.f32, ELSE only stores %data. The expected ISA below
	; contains no s_wqm_b64.
	; NOTE(review): @test_control_flow_0 is also defined earlier in this file;
	; this second definition looks like an extraction artifact - confirm.
	define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
	; CHECK-LABEL: test_control_flow_0:
	; CHECK: ; %bb.0: ; %main_body
	; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
	; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc
	; CHECK-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
	; CHECK-NEXT: s_cbranch_execz .LBB6_2
	; CHECK-NEXT: ; %bb.1: ; %ELSE
	; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
	; CHECK-NEXT: .LBB6_2: ; %Flow
	; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
	; CHECK-NEXT: s_cbranch_execz .LBB6_4
	; CHECK-NEXT: ; %bb.3: ; %IF
	; CHECK-NEXT: v_mov_b32_e32 v0, s12
	; CHECK-NEXT: v_mov_b32_e32 v1, s13
	; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
	; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: v_add_f32_e32 v2, v0, v1
	; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
	; CHECK-NEXT: .LBB6_4: ; %END
	; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
	; CHECK-NEXT: v_mov_b32_e32 v0, v2
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: ; return to shader part epilog
	main_body:
	%cmp = icmp eq i32 %z, 0
	br i1 %cmp, label %IF, label %ELSE

	; softwqm appears only on this path.
	IF:
	%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
	%out = fadd float %src0, %src1
	%data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
	br label %END

	; This path only performs a buffer store of the incoming %data.
	ELSE:
	call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
	br label %END

	END:
	%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
	ret float %r
	}

	; Check that softwqm on one case of branch is treated as WQM in WQM shader.
	;
	; Same shape as test_control_flow_0, but ELSE performs two dependent
	; image.sample.1d calls and stores the second result. The expected ISA
	; executes s_wqm_b64 at entry and applies s_and_b64 exec, exec, s[14:15]
	; in END.
	; NOTE(review): the image.sample call sites reference attribute group #0,
	; which is not defined in the visible part of this file - confirm.
	; NOTE(review): @test_control_flow_1 is also defined earlier in this file;
	; this second definition looks like an extraction artifact - confirm.
	define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
	; CHECK-LABEL: test_control_flow_1:
	; CHECK: ; %bb.0: ; %main_body
	; CHECK-NEXT: s_mov_b64 s[14:15], exec
	; CHECK-NEXT: s_wqm_b64 exec, exec
	; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
	; CHECK-NEXT: s_and_saveexec_b64 s[16:17], vcc
	; CHECK-NEXT: s_xor_b64 s[16:17], exec, s[16:17]
	; CHECK-NEXT: s_cbranch_execz .LBB7_2
	; CHECK-NEXT: ; %bb.1: ; %ELSE
	; CHECK-NEXT: image_sample v1, v0, s[0:7], s[8:11] dmask:0x1
	; CHECK-NEXT: s_and_saveexec_b64 s[18:19], s[14:15]
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
	; CHECK-NEXT: s_mov_b64 exec, s[18:19]
	; CHECK-NEXT: .LBB7_2: ; %Flow
	; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[16:17]
	; CHECK-NEXT: s_cbranch_execz .LBB7_4
	; CHECK-NEXT: ; %bb.3: ; %IF
	; CHECK-NEXT: v_mov_b32_e32 v0, s12
	; CHECK-NEXT: v_mov_b32_e32 v1, s13
	; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
	; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: v_add_f32_e32 v2, v0, v1
	; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
	; CHECK-NEXT: .LBB7_4: ; %END
	; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
	; CHECK-NEXT: s_and_b64 exec, exec, s[14:15]
	; CHECK-NEXT: v_mov_b32_e32 v0, v2
	; CHECK-NEXT: s_waitcnt vmcnt(0)
	; CHECK-NEXT: ; return to shader part epilog
	main_body:
	%cmp = icmp eq i32 %z, 0
	br i1 %cmp, label %IF, label %ELSE

	; softwqm appears only on this path.
	IF:
	%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
	%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
	%out = fadd float %src0, %src1
	%data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
	br label %END

	; Image sampling path: the first sample's element 0 is the coordinate of the
	; second sample, whose element 0 is then stored.
	ELSE:
	%c.bc = bitcast i32 %c to float
	%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
	%tex0 = extractelement <4 x float> %tex, i32 0
	%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
	%data.sample = extractelement <4 x float> %dtex, i32 0

	call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
	br label %END

	; The ELSE incoming value is the original %data, not %data.sample.
	END:
	%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
	ret float %r
	}

	; Intrinsic declarations for the tests above.
	; NOTE(review): the same declarations and attribute groups already appear
	; earlier in this file; this repeat looks like an extraction artifact.
	; NOTE(review): struct.buffer.store.v4f32, image.sample.2d and wqm.demote are
	; declared here but not called anywhere in the visible part of this file.
	declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
	declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
	declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3
	declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
	declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
	declare float @llvm.amdgcn.wqm.f32(float) #3
	declare float @llvm.amdgcn.softwqm.f32(float) #3
	declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
	declare float @llvm.amdgcn.strict.wwm.f32(float) #3
	declare float @llvm.amdgcn.wwm.f32(float) #3
	declare void @llvm.amdgcn.wqm.demote(i1) #1

	; Attribute groups used by the declarations above.
	; NOTE(review): test_control_flow_1 references group #0 at its image.sample
	; call sites, but no #0 group is defined in the visible part of this file.
	attributes #1 = { nounwind }
	attributes #2 = { nounwind readonly }
	attributes #3 = { nounwind readnone }