; llvm/test/CodeGen/AMDGPU/extract-vector-elt-binop-build-vector.ll

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s

 ; Extracting both lanes from an OR of two build_vectors that share a
 ; variable operand at different lane positions.  The lane-1 result of
 ;   or(<v, -1>, <255, v>)
 ; is -1, not v.  A scalarisation bug could lose the -1 constant and
 ; reuse v for lane 1, turning the subtract into (v|255)-v instead of
 ; (v|255)-(-1).

 define amdgpu_ps i32 @extract_or_build_vectors_shared_operand(i32 %v) {
 ; GFX9-LABEL: extract_or_build_vectors_shared_operand:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_or_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extract_or_build_vectors_shared_operand:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_or_b32_e32 v0, 0xff, v0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, -1, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
   ; Build the two operands, lhs = <v, -1> and rhs = <255, v>; %v occupies a
   ; different lane in each of them.
   %lhs = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0
   %rhs = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1
   ; Lane-wise OR, so lane 0 holds v | 255 and lane 1 holds -1 | v.
   %merged = or <2 x i32> %lhs, %rhs
   ; Read both lanes back and return lane 0 minus lane 1.
   %lane0 = extractelement <2 x i32> %merged, i32 0
   %lane1 = extractelement <2 x i32> %merged, i32 1
   %diff = sub i32 %lane0, %lane1
   ret i32 %diff
 }

 ; Same idea but with AND.  Lane 0 of
 ;   and(<v, -1>, <255, v>)
 ; is v & 255, lane 1 is -1 & v = v.

 define amdgpu_ps i32 @extract_and_build_vectors_shared_operand(i32 %v) {
 ; GFX9-LABEL: extract_and_build_vectors_shared_operand:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_sub_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extract_and_build_vectors_shared_operand:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_sub_nc_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
   ; Build the two operands, lhs = <v, -1> and rhs = <255, v>; %v occupies a
   ; different lane in each of them.
   %lhs = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0
   %rhs = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1
   ; Lane-wise AND, so lane 0 holds v & 255 and lane 1 holds -1 & v.
   %masked = and <2 x i32> %lhs, %rhs
   ; Read both lanes back and return lane 0 minus lane 1.
   %lane0 = extractelement <2 x i32> %masked, i32 0
   %lane1 = extractelement <2 x i32> %masked, i32 1
   %diff = sub i32 %lane0, %lane1
   ret i32 %diff
 }

 ; XOR variant.  Lane 1 of
 ;   xor(<v, -1>, <255, v>)
 ; is -1 ^ v = ~v.

 define amdgpu_ps i32 @extract_xor_build_vectors_shared_operand(i32 %v) {
 ; GFX9-LABEL: extract_xor_build_vectors_shared_operand:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: extract_xor_build_vectors_shared_operand:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor_b32_e32 v1, -1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0xff, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
   ; Build the two operands, lhs = <v, -1> and rhs = <255, v>; %v occupies a
   ; different lane in each of them.
   %lhs = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0
   %rhs = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1
   ; Lane-wise XOR, so lane 0 holds v ^ 255 and lane 1 holds -1 ^ v.
   %flipped = xor <2 x i32> %lhs, %rhs
   ; Read both lanes back and return lane 0 minus lane 1.
   %lane0 = extractelement <2 x i32> %flipped, i32 0
   %lane1 = extractelement <2 x i32> %flipped, i32 1
   %diff = sub i32 %lane0, %lane1
   ret i32 %diff
 }
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
	; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
	; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s

	; Extracting both lanes from an OR of two build_vectors that share a
	; variable operand at different lane positions. The lane-1 result of
	; or(<v, -1>, <255, v>)
	; is -1, not v. A scalarisation bug could lose the -1 constant and
	; reuse v for lane 1, turning the subtract into (v|255)-v instead of
	; (v|255)-(-1).

	define amdgpu_ps i32 @extract_or_build_vectors_shared_operand(i32 %v) {
	; GFX9-LABEL: extract_or_build_vectors_shared_operand:
	; GFX9: ; %bb.0:
	; GFX9-NEXT: v_or_b32_e32 v0, 0xff, v0
	; GFX9-NEXT: v_subrev_u32_e32 v0, -1, v0
	; GFX9-NEXT: v_readfirstlane_b32 s0, v0
	; GFX9-NEXT: ; return to shader part epilog
	;
	; GFX10-LABEL: extract_or_build_vectors_shared_operand:
	; GFX10: ; %bb.0:
	; GFX10-NEXT: v_or_b32_e32 v0, 0xff, v0
	; GFX10-NEXT: v_subrev_nc_u32_e32 v0, -1, v0
	; GFX10-NEXT: v_readfirstlane_b32 s0, v0
	; GFX10-NEXT: ; return to shader part epilog
	; NOTE(review) this is a second definition of
	; @extract_or_build_vectors_shared_operand in this file; the same symbol is
	; already defined earlier with identical IR. A module cannot define one
	; function twice, so one of the two copies should be removed.
	;
	; %va = <v, -1> and %vb = <255, v>; %v occupies a different lane in each.
	%va = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0
	%vb = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1
	; Lane-wise OR, so lane 0 holds v | 255 and lane 1 holds -1 | v.
	%or = or <2 x i32> %va, %vb
	; Read both lanes back and return lane 0 minus lane 1.
	%e0 = extractelement <2 x i32> %or, i32 0
	%e1 = extractelement <2 x i32> %or, i32 1
	%sub = sub i32 %e0, %e1
	ret i32 %sub
	}

	; Same idea but with AND. Lane 0 of
	; and(<v, -1>, <255, v>)
	; is v & 255, lane 1 is -1 & v = v.

	define amdgpu_ps i32 @extract_and_build_vectors_shared_operand(i32 %v) {
	; GFX9-LABEL: extract_and_build_vectors_shared_operand:
	; GFX9: ; %bb.0:
	; GFX9-NEXT: v_sub_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
	; GFX9-NEXT: v_readfirstlane_b32 s0, v0
	; GFX9-NEXT: ; return to shader part epilog
	;
	; GFX10-LABEL: extract_and_build_vectors_shared_operand:
	; GFX10: ; %bb.0:
	; GFX10-NEXT: v_sub_nc_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
	; GFX10-NEXT: v_readfirstlane_b32 s0, v0
	; GFX10-NEXT: ; return to shader part epilog
	; NOTE(review) this is a second definition of
	; @extract_and_build_vectors_shared_operand in this file; the same symbol is
	; already defined earlier with identical IR. A module cannot define one
	; function twice, so one of the two copies should be removed.
	;
	; %va = <v, -1> and %vb = <255, v>; %v occupies a different lane in each.
	%va = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0
	%vb = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1
	; Lane-wise AND, so lane 0 holds v & 255 and lane 1 holds -1 & v.
	%a = and <2 x i32> %va, %vb
	; Read both lanes back and return lane 0 minus lane 1.
	%e0 = extractelement <2 x i32> %a, i32 0
	%e1 = extractelement <2 x i32> %a, i32 1
	%sub = sub i32 %e0, %e1
	ret i32 %sub
	}

	; XOR variant. Lane 1 of
	; xor(<v, -1>, <255, v>)
	; is -1 ^ v = ~v.

	define amdgpu_ps i32 @extract_xor_build_vectors_shared_operand(i32 %v) {
	; GFX9-LABEL: extract_xor_build_vectors_shared_operand:
	; GFX9: ; %bb.0:
	; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0
	; GFX9-NEXT: v_xor_b32_e32 v0, 0xff, v0
	; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
	; GFX9-NEXT: v_readfirstlane_b32 s0, v0
	; GFX9-NEXT: ; return to shader part epilog
	;
	; GFX10-LABEL: extract_xor_build_vectors_shared_operand:
	; GFX10: ; %bb.0:
	; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
	; GFX10-NEXT: v_xor_b32_e32 v0, 0xff, v0
	; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1
	; GFX10-NEXT: v_readfirstlane_b32 s0, v0
	; GFX10-NEXT: ; return to shader part epilog
	; NOTE(review) this is a second definition of
	; @extract_xor_build_vectors_shared_operand in this file; the same symbol is
	; already defined earlier with identical IR. A module cannot define one
	; function twice, so one of the two copies should be removed.
	;
	; %va = <v, -1> and %vb = <255, v>; %v occupies a different lane in each.
	%va = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0
	%vb = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1
	; Lane-wise XOR, so lane 0 holds v ^ 255 and lane 1 holds -1 ^ v.
	%x = xor <2 x i32> %va, %vb
	; Read both lanes back and return lane 0 minus lane 1.
	%e0 = extractelement <2 x i32> %x, i32 0
	%e1 = extractelement <2 x i32> %x, i32 1
	%sub = sub i32 %e0, %e1
	ret i32 %sub
	}