; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
; Extracting both lanes from an OR of two build_vectors that share a
; variable operand at different lane positions. The lane-1 result of
; or(<v, -1>, <255, v>)
; is -1, not v. A scalarisation bug could lose the -1 constant and
; reuse v for lane 1, turning the subtract into (v|255)-v instead of
; (v|255)-(-1).
;
; On both tested subtargets (gfx900 and gfx1030) the assertions below expect
; a single OR of v with 0xff followed by a subtract that takes the constant
; -1 as an operand, i.e. lane 1 must have been folded to -1.
define amdgpu_ps i32 @extract_or_build_vectors_shared_operand(i32 %v) {
; GFX9-LABEL: extract_or_build_vectors_shared_operand:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_or_b32_e32 v0, 0xff, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, -1, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extract_or_build_vectors_shared_operand:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_or_b32_e32 v0, 0xff, v0
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, -1, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
; %va = <v, -1>: the shared operand %v is placed in lane 0.
%va = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0
; %vb = <255, v>: the same operand %v is placed in lane 1.
%vb = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1
%or = or <2 x i32> %va, %vb
; Lane 0 is v | 255; lane 1 is -1 | v, which is always -1.
%e0 = extractelement <2 x i32> %or, i32 0
%e1 = extractelement <2 x i32> %or, i32 1
; Result is lane 0 minus lane 1, i.e. (v | 255) - (-1).
%sub = sub i32 %e0, %e1
ret i32 %sub
}
; Same idea but with AND. Lane 0 of
; and(<v, -1>, <255, v>)
; is v & 255, lane 1 is -1 & v = v.
;
; The assertions below expect the whole computation to collapse into one
; SDWA subtract on both subtargets: the first source selects BYTE_0 of v
; (the v & 255 lane) and the second source is the full DWORD of v.
define amdgpu_ps i32 @extract_and_build_vectors_shared_operand(i32 %v) {
; GFX9-LABEL: extract_and_build_vectors_shared_operand:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_sub_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extract_and_build_vectors_shared_operand:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_sub_nc_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
; %va = <v, -1>: the shared operand %v is placed in lane 0.
%va = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0
; %vb = <255, v>: the same operand %v is placed in lane 1.
%vb = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1
%a = and <2 x i32> %va, %vb
; Lane 0 is v & 255; lane 1 is -1 & v, which is v itself.
%e0 = extractelement <2 x i32> %a, i32 0
%e1 = extractelement <2 x i32> %a, i32 1
; Result is lane 0 minus lane 1, i.e. (v & 255) - v.
%sub = sub i32 %e0, %e1
ret i32 %sub
}
; XOR variant. Lane 1 of
; xor(<v, -1>, <255, v>)
; is -1 ^ v = ~v.
;
; Neither lane folds to a constant or to v itself here, so the assertions
; below expect two scalar XORs (v with -1, and v with 0xff) followed by a
; subtract of the first result from the second on both subtargets.
define amdgpu_ps i32 @extract_xor_build_vectors_shared_operand(i32 %v) {
; GFX9-LABEL: extract_xor_build_vectors_shared_operand:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0
; GFX9-NEXT: v_xor_b32_e32 v0, 0xff, v0
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extract_xor_build_vectors_shared_operand:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
; GFX10-NEXT: v_xor_b32_e32 v0, 0xff, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
; %va = <v, -1>: the shared operand %v is placed in lane 0.
%va = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0
; %vb = <255, v>: the same operand %v is placed in lane 1.
%vb = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1
%x = xor <2 x i32> %va, %vb
; Lane 0 is v ^ 255; lane 1 is -1 ^ v, i.e. ~v.
%e0 = extractelement <2 x i32> %x, i32 0
%e1 = extractelement <2 x i32> %x, i32 1
; Result is lane 0 minus lane 1, i.e. (v ^ 255) - (~v).
%sub = sub i32 %e0, %e1
ret i32 %sub
}