| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s |
| |
| ; Extracting both lanes from an OR of two build_vectors that share a |
| ; variable operand at different lane positions. The lane-1 result of |
| ; or(<v, -1>, <255, v>) |
| ; is -1, not v. A scalarisation bug could lose the -1 constant and |
| ; reuse v for lane 1, turning the subtract into (v|255)-v instead of |
| ; (v|255)-(-1). |
| |
| define amdgpu_ps i32 @extract_or_build_vectors_shared_operand(i32 %v) { |
| ; GFX9-LABEL: extract_or_build_vectors_shared_operand: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_or_b32_e32 v0, 0xff, v0 |
| ; GFX9-NEXT: v_subrev_u32_e32 v0, -1, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: extract_or_build_vectors_shared_operand: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_or_b32_e32 v0, 0xff, v0 |
| ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, -1, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; Returns lane 0 minus lane 1 of or(<v, -1>, <255, v>), where %v occupies a |
| ; different lane in each operand vector. |
| ; |
| ; %va = <v, -1>: lane 0 of the constant <0, -1> is overwritten with %v. |
| %va = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0 |
| ; %vb = <255, v>: lane 1 of the constant <255, 0> is overwritten with %v. |
| %vb = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1 |
| ; Lane-wise OR: lane 0 = v | 255, lane 1 = -1 | v = -1. |
| %or = or <2 x i32> %va, %vb |
| ; Both lanes are extracted so that each needs its own scalar value. |
| %e0 = extractelement <2 x i32> %or, i32 0 |
| %e1 = extractelement <2 x i32> %or, i32 1 |
| ; (v | 255) - (-1): the expected code keeps -1 as an operand of the subtract, |
| ; not a second copy of v. |
| %sub = sub i32 %e0, %e1 |
| ret i32 %sub |
| } |
| |
| ; Same idea but with AND. Lane 0 of |
| ; and(<v, -1>, <255, v>) |
| ; is v & 255 and lane 1 is -1 & v = v, so the subtract must be |
| ; (v & 255) - v. |
| |
| define amdgpu_ps i32 @extract_and_build_vectors_shared_operand(i32 %v) { |
| ; GFX9-LABEL: extract_and_build_vectors_shared_operand: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_sub_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: extract_and_build_vectors_shared_operand: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_sub_nc_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; Returns lane 0 minus lane 1 of and(<v, -1>, <255, v>), where %v occupies a |
| ; different lane in each operand vector. |
| ; |
| ; %va = <v, -1>: lane 0 of the constant <0, -1> is overwritten with %v. |
| %va = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0 |
| ; %vb = <255, v>: lane 1 of the constant <255, 0> is overwritten with %v. |
| %vb = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1 |
| ; Lane-wise AND: lane 0 = v & 255, lane 1 = -1 & v = v. |
| %a = and <2 x i32> %va, %vb |
| ; Both lanes are extracted so that each needs its own scalar value. |
| %e0 = extractelement <2 x i32> %a, i32 0 |
| %e1 = extractelement <2 x i32> %a, i32 1 |
| ; (v & 255) - v: the expected code is one SDWA subtract with src0_sel:BYTE_0 |
| ; and src1_sel:DWORD, and no separate AND instruction. |
| %sub = sub i32 %e0, %e1 |
| ret i32 %sub |
| } |
| |
| ; XOR variant. Lane 0 of |
| ; xor(<v, -1>, <255, v>) |
| ; is v ^ 255 and lane 1 is -1 ^ v = ~v, so the subtract must be |
| ; (v ^ 255) - ~v. |
| |
| define amdgpu_ps i32 @extract_xor_build_vectors_shared_operand(i32 %v) { |
| ; GFX9-LABEL: extract_xor_build_vectors_shared_operand: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 |
| ; GFX9-NEXT: v_xor_b32_e32 v0, 0xff, v0 |
| ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: extract_xor_build_vectors_shared_operand: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 |
| ; GFX10-NEXT: v_xor_b32_e32 v0, 0xff, v0 |
| ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; Returns lane 0 minus lane 1 of xor(<v, -1>, <255, v>), where %v occupies a |
| ; different lane in each operand vector. |
| ; |
| ; %va = <v, -1>: lane 0 of the constant <0, -1> is overwritten with %v. |
| %va = insertelement <2 x i32> <i32 0, i32 -1>, i32 %v, i32 0 |
| ; %vb = <255, v>: lane 1 of the constant <255, 0> is overwritten with %v. |
| %vb = insertelement <2 x i32> <i32 255, i32 0>, i32 %v, i32 1 |
| ; Lane-wise XOR: lane 0 = v ^ 255, lane 1 = -1 ^ v = ~v. |
| %x = xor <2 x i32> %va, %vb |
| ; Both lanes are extracted so that each needs its own scalar value. |
| %e0 = extractelement <2 x i32> %x, i32 0 |
| %e1 = extractelement <2 x i32> %x, i32 1 |
| ; (v ^ 255) - ~v: the expected code computes both XORs (with -1 and with 0xff) |
| ; and then subtracts the -1 result from the 0xff result. |
| %sub = sub i32 %e0, %e1 |
| ret i32 %sub |
| } |