; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefix=GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s --check-prefix=GFX12-TRUE16
; Test that isel patterns recognize (and (lshr x, C), mask) for i8/i16
; and lower it to v_bfe_u32 when real true16 instructions are not used.
; Scalar i16 case: extract the 4-bit field at offset 4, i.e. (%a >> 4) & 0xF.
; The GFX9 checks show the shift+mask folded into a single v_bfe_u32, while
; the GFX12 real-true16 checks keep the 16-bit v_lshrrev_b16 + v_and_b16 pair.
define i16 @bfe_i16(i16 %a) {
; GFX9-LABEL: bfe_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_bfe_u32 v0, v0, 4, 4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-TRUE16-LABEL: bfe_i16:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b16 v0.l, 4, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
  %shr = lshr i16 %a, 4 ; move bits [7:4] down to [3:0]
  %and = and i16 %shr, 15 ; keep only the low 4 bits
  ret i16 %and
}
; i8: 4 bits at offset 4.
; i8 variant of the same pattern: (%a >> 4) & 0xF. The checks show it lowers
; identically to the i16 case (v_bfe_u32 on GFX9, 16-bit shift+and on TRUE16).
define i8 @bfe_i8(i8 %a) {
; GFX9-LABEL: bfe_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_bfe_u32 v0, v0, 4, 4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-TRUE16-LABEL: bfe_i8:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b16 v0.l, 4, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
  %shr = lshr i8 %a, 4 ; move bits [7:4] down to [3:0]
  %and = and i8 %shr, 15 ; keep only the low 4 bits
  ret i8 %and
}
; Negative: multiple uses of the shifted value should not match the one-use
; narrow BFE pattern.
; %shr feeds two separate 'and's, so no single BFE covers both uses; the
; checks show the shift is kept and each mask is applied independently.
define i16 @no_bfe_i16_multi_use(i16 %a) {
; GFX9-LABEL: no_bfe_i16_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 4, v0
; GFX9-NEXT: v_and_b32_e32 v1, 15, v0
; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-TRUE16-LABEL: no_bfe_i16_multi_use:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b16 v0.l, 4, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b16 v0.h, v0.l, 15
; GFX12-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3
; GFX12-TRUE16-NEXT: v_xor_b16 v0.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
  %shr = lshr i16 %a, 4 ; shared shift result with two users below
  %and0 = and i16 %shr, 15 ; first use: 4-bit mask
  %and1 = and i16 %shr, 3 ; second use: 2-bit mask
  %xor = xor i16 %and0, %and1 ; combine so both 'and's stay live
  ret i16 %xor
}
; Pure uniform (SGPR) case.
; inreg argument keeps the value in an SGPR; both targets select the scalar
; s_bfe_u32 (immediate 0x40004 — presumably width 4 / offset 4; matches the
; 4-bit field at offset 4 extracted by the IR below).
define amdgpu_ps i16 @bfe_i16_uniform(i16 inreg %a) {
; GFX9-LABEL: bfe_i16_uniform:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004
; GFX9-NEXT: ; return to shader part epilog
;
; GFX12-TRUE16-LABEL: bfe_i16_uniform:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_bfe_u32 s0, s0, 0x40004
; GFX12-TRUE16-NEXT: ; return to shader part epilog
  %shr = lshr i16 %a, 4 ; move bits [7:4] down to [3:0]
  %and = and i16 %shr, 15 ; keep only the low 4 bits
  ret i16 %and
}
; Uniform i8 case: same s_bfe_u32 lowering as the uniform i16 case above.
define amdgpu_ps i8 @bfe_i8_uniform(i8 inreg %a) {
; GFX9-LABEL: bfe_i8_uniform:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x40004
; GFX9-NEXT: ; return to shader part epilog
;
; GFX12-TRUE16-LABEL: bfe_i8_uniform:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_bfe_u32 s0, s0, 0x40004
; GFX12-TRUE16-NEXT: ; return to shader part epilog
  %shr = lshr i8 %a, 4 ; move bits [7:4] down to [3:0]
  %and = and i8 %shr, 15 ; keep only the low 4 bits
  ret i8 %and
}
; Vector case: keep the packed shift/and lowering.
; <2 x i16> case: the checks show both targets keep the packed
; v_pk_lshrrev_b16 plus a 32-bit mask (0xf000f = 0xF per lane) rather than
; forming a BFE.
define <2 x i16> @bfe_v2i16(<2 x i16> %a) {
; GFX9-LABEL: bfe_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 4, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_and_b32_e32 v0, 0xf000f, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-TRUE16-LABEL: bfe_v2i16:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 4, v0 op_sel_hi:[0,1]
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xf000f, v0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
  %shr = lshr <2 x i16> %a, <i16 4, i16 4> ; per-lane shift by 4
  %and = and <2 x i16> %shr, <i16 15, i16 15> ; per-lane 4-bit mask
  ret <2 x i16> %and
}