| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefix=GFX9 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s --check-prefix=GFX12-TRUE16 |
| |
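| ; Test that isel recognizes (and (lshr x, C), mask) on i8/i16 values and |
| ; selects a single v_bfe_u32 when real true16 instructions are not in use |
| ; (the gfx900 run). With +real-true16 on gfx1200 the divergent scalar cases |
| ; keep the 16-bit shift + and pair, while the uniform cases select s_bfe_u32 |
| ; on both runs. |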
| |
| ; i16: 4 bits at offset 4, i.e. bits 7:4 of %a (shift right by 4, mask with 15). |
| ; The shifted value has a single use, so the gfx900 checks expect one |
| ; v_bfe_u32; the gfx1200 +real-true16 checks expect the 16-bit shift and the |
| ; 16-bit and to stay separate. |
| define i16 @bfe_i16(i16 %a) { |
| ; GFX9-LABEL: bfe_i16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_bfe_u32 v0, v0, 4, 4 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-TRUE16-LABEL: bfe_i16: |
| ; GFX12-TRUE16: ; %bb.0: |
| ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-TRUE16-NEXT: v_lshrrev_b16 v0.l, 4, v0.l |
| ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15 |
| ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| %shr = lshr i16 %a, 4 |
| %and = and i16 %shr, 15 |
| ret i16 %and |
| } |
| |
| ; i8: 4 bits at offset 4. After the shift by 4 only the low 4 bits of the i8 |
| ; can be set, so the mask with 15 leaves the value unchanged at the IR level; |
| ; the checks expect the same code as bfe_i16 on both runs. |
| define i8 @bfe_i8(i8 %a) { |
| ; GFX9-LABEL: bfe_i8: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_bfe_u32 v0, v0, 4, 4 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-TRUE16-LABEL: bfe_i8: |
| ; GFX12-TRUE16: ; %bb.0: |
| ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-TRUE16-NEXT: v_lshrrev_b16 v0.l, 4, v0.l |
| ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15 |
| ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| %shr = lshr i8 %a, 4 |
| %and = and i8 %shr, 15 |
| ret i8 %and |
| } |
| |
| ; Negative: multiple uses of the shifted value should not match the one-use |
| ; narrow BFE pattern. %shr feeds two different masks (15 and 3) whose results |
| ; are xor'ed, and the checks on both runs expect one shared shift followed by |
| ; two and instructions and an xor rather than any v_bfe_u32. |
| define i16 @no_bfe_i16_multi_use(i16 %a) { |
| ; GFX9-LABEL: no_bfe_i16_multi_use: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 4, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v1, 15, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 |
| ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-TRUE16-LABEL: no_bfe_i16_multi_use: |
| ; GFX12-TRUE16: ; %bb.0: |
| ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-TRUE16-NEXT: v_lshrrev_b16 v0.l, 4, v0.l |
| ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX12-TRUE16-NEXT: v_and_b16 v0.h, v0.l, 15 |
| ; GFX12-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3 |
| ; GFX12-TRUE16-NEXT: v_xor_b16 v0.l, v0.h, v0.l |
| ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| %shr = lshr i16 %a, 4 |
| %and0 = and i16 %shr, 15 |
| %and1 = and i16 %shr, 3 |
| %xor = xor i16 %and0, %and1 |
| ret i16 %xor |
| } |
| |
| ; Pure uniform (SGPR) case: %a is an inreg argument of an amdgpu_ps function |
| ; and the checks operate on s0. Both runs, including +real-true16, expect a |
| ; single scalar s_bfe_u32. |
| ; NOTE(review): 0x40004 is presumably the packed operand with width 4 in the |
| ; upper 16 bits and offset 4 in the lower bits - confirm against the ISA docs. |
| define amdgpu_ps i16 @bfe_i16_uniform(i16 inreg %a) { |
| ; GFX9-LABEL: bfe_i16_uniform: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-TRUE16-LABEL: bfe_i16_uniform: |
| ; GFX12-TRUE16: ; %bb.0: |
| ; GFX12-TRUE16-NEXT: s_bfe_u32 s0, s0, 0x40004 |
| ; GFX12-TRUE16-NEXT: ; return to shader part epilog |
| %shr = lshr i16 %a, 4 |
| %and = and i16 %shr, 15 |
| ret i16 %and |
| } |
| |
| ; Uniform i8 variant of bfe_i16_uniform: the same shift by 4 and mask with 15 |
| ; on an inreg i8 argument, with the same single s_bfe_u32 on s0 expected on |
| ; both runs. |
| define amdgpu_ps i8 @bfe_i8_uniform(i8 inreg %a) { |
| ; GFX9-LABEL: bfe_i8_uniform: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX12-TRUE16-LABEL: bfe_i8_uniform: |
| ; GFX12-TRUE16: ; %bb.0: |
| ; GFX12-TRUE16-NEXT: s_bfe_u32 s0, s0, 0x40004 |
| ; GFX12-TRUE16-NEXT: ; return to shader part epilog |
| %shr = lshr i8 %a, 4 |
| %and = and i8 %shr, 15 |
| ret i8 %and |
| } |
| |
| ; Vector case: keep the packed shift/and lowering. Both runs expect |
| ; v_pk_lshrrev_b16 followed by one 32-bit and whose constant 0xf000f is the |
| ; per-element mask 15 replicated into both 16-bit halves; no BFE is expected. |
| define <2 x i16> @bfe_v2i16(<2 x i16> %a) { |
| ; GFX9-LABEL: bfe_v2i16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 4, v0 op_sel_hi:[0,1] |
| ; GFX9-NEXT: v_and_b32_e32 v0, 0xf000f, v0 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-TRUE16-LABEL: bfe_v2i16: |
| ; GFX12-TRUE16: ; %bb.0: |
| ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 4, v0 op_sel_hi:[0,1] |
| ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xf000f, v0 |
| ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| %shr = lshr <2 x i16> %a, <i16 4, i16 4> |
| %and = and <2 x i16> %shr, <i16 15, i16 15> |
| ret <2 x i16> %and |
| } |