; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn-- -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,SDAG %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GISEL %s
; RUN: llc -O0 -mtriple=amdgcn-- -mcpu=gfx950 < %s | FileCheck -check-prefix=O0 %s
; Regression tests for v_bitop3_b32 truth table computation when LHS and RHS
; of a boolean node share a common sub-expression. The BitOp3_Op algorithm
; can decompose the shared sub-expression on one side and invalidate the
; bit-pattern assigned to the other side.
; Test: ((wi ^ x) & x) | mul, where x = (mul ^ C) & mul and C = 0xAAAAAAAA.
; x is shared: it feeds both the inner XOR and the AND beneath the top-level OR,
; and x is itself derived from mul, the other operand of that OR.
define amdgpu_ps float @bitop3_xor_and_or(i32 %wi, i32 %mul) {
; GCN-LABEL: bitop3_xor_and_or:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s0, 0xaaaaaaaa
; GCN-NEXT: v_xor_b32_e32 v2, 0xaaaaaaaa, v1
; GCN-NEXT: v_bitop3_b32 v3, v1, v1, s0 bitop3:0x48
; GCN-NEXT: v_bitop3_b32 v0, v0, v2, v1 bitop3:0x78
; GCN-NEXT: v_and_or_b32 v0, v0, v3, v1
; GCN-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_xor_and_or:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v2, v1
; O0-NEXT: s_mov_b32 s0, 0xaaaaaaaa
; O0-NEXT: v_xor_b32_e64 v3, v2, s0
; O0-NEXT: v_bitop3_b32 v1, v2, v2, s0 bitop3:0x48
; O0-NEXT: v_bitop3_b32 v0, v0, v3, v2 bitop3:0x78
; O0-NEXT: v_and_or_b32 v0, v0, v1, v2
; O0-NEXT: ; return to shader part epilog
%xor = xor i32 %mul, -1431655766 ; mul ^ C; -1431655766 is 0xAAAAAAAA as i32
%x = and i32 %xor, %mul ; x = (mul ^ C) & mul, the shared sub-expression
%yxor = xor i32 %wi, %x ; first use of x
%and = and i32 %yxor, %x ; second use of x
%result = or i32 %and, %mul ; top-level OR; mul is also an input of x
%ret = bitcast i32 %result to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: (umax(masked, 2) & mix) & not, where mix = v ^ salt, not = ~mix and
; masked = shl & not.
; mix and not are complements built from the same sub-expression, and both
; reach the final AND chain, so the result is always 0 (x & ~x == 0).
define amdgpu_ps float @bitop3_umax_and_not(i32 %v, i32 %salt, i32 %shl) {
; GCN-LABEL: bitop3_umax_and_not:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_umax_and_not:
; O0: ; %bb.0:
; O0-NEXT: v_accvgpr_write_b32 a0, v2 ; Reload Reuse
; O0-NEXT: v_mov_b32_e32 v3, v1
; O0-NEXT: v_mov_b32_e32 v2, v0
; O0-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse
; O0-NEXT: v_xnor_b32_e64 v1, v2, v3
; O0-NEXT: v_bitop3_b32 v0, v0, v2, v3 bitop3:0x90
; O0-NEXT: s_mov_b32 s0, 2
; O0-NEXT: v_max_u32_e64 v0, v0, s0
; O0-NEXT: v_bitop3_b32 v0, v0, v2, v3 bitop3:0x60
; O0-NEXT: v_and_b32_e64 v0, v0, v1
; O0-NEXT: ; return to shader part epilog
%mix = xor i32 %v, %salt ; shared sub-expression
%not = xor i32 %mix, -1 ; ~mix
%masked = and i32 %shl, %not ; first use of ~mix
%umax = call i32 @llvm.umax.i32(i32 %masked, i32 2) ; non-bitwise op in the middle of the bitwise tree
%and0 = and i32 %umax, %mix ; use of mix
%result = and i32 %and0, %not ; second use of ~mix; mix & ~mix makes this 0
%ret = bitcast i32 %result to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: ashr((sel ^ mix) & sel, 31), where sel = umax(mix << 8, mix) and
; mix = v ^ salt.
; mix feeds both the umax and the XOR; sel feeds both the XOR and the AND.
define amdgpu_ps float @bitop3_umax_xor_and(i32 %v, i32 %salt) {
; GCN-LABEL: bitop3_umax_xor_and:
; GCN: ; %bb.0:
; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GCN-NEXT: v_max_u32_e32 v1, v1, v0
; GCN-NEXT: v_bitop3_b32 v0, v1, v1, v0 bitop3:0x48
; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0
; GCN-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_umax_xor_and:
; O0: ; %bb.0:
; O0-NEXT: v_xor_b32_e64 v1, v0, v1
; O0-NEXT: s_mov_b32 s0, 8
; O0-NEXT: v_lshlrev_b32_e64 v0, s0, v1
; O0-NEXT: v_max_u32_e64 v0, v0, v1
; O0-NEXT: v_bitop3_b32 v0, v0, v0, v1 bitop3:0x48
; O0-NEXT: s_mov_b32 s0, 31
; O0-NEXT: v_ashrrev_i32_e64 v0, s0, v0
; O0-NEXT: ; return to shader part epilog
%mix = xor i32 %v, %salt ; shared sub-expression
%shl = shl i32 %mix, 8
%sel = call i32 @llvm.umax.i32(i32 %shl, i32 %mix) ; sel = umax(mix << 8, mix)
%xor = xor i32 %sel, %mix ; uses both sel and mix
%and = and i32 %xor, %sel ; sel appears on both sides of this AND
%result = ashr i32 %and, 31 ; arithmetic shift: broadcasts bit 31 to all bits
%ret = bitcast i32 %result to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: masked ^ (masked & 16), where masked = (tid ^ 16) & 18.
; masked is both the LHS of the top XOR and an input of its RHS.
define amdgpu_ps float @bitop3_xor_masked_and(i32 %tid) {
; GCN-LABEL: bitop3_xor_masked_and:
; GCN: ; %bb.0:
; GCN-NEXT: v_bitop3_b32 v1, v0, 18, 16 bitop3:0x48
; GCN-NEXT: v_bitop3_b32 v0, v0, 16, v0 bitop3:0xc
; GCN-NEXT: v_xor_b32_e32 v0, v1, v0
; GCN-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_xor_masked_and:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v1, v0
; O0-NEXT: s_mov_b32 s1, 16
; O0-NEXT: v_xor_b32_e64 v0, v1, s1
; O0-NEXT: s_mov_b32 s0, 18
; O0-NEXT: v_mov_b32_e32 v2, s0
; O0-NEXT: v_bitop3_b32 v1, v1, s1, v2 bitop3:8
; O0-NEXT: v_bitop3_b32 v0, v0, v1, s0 bitop3:0x6c
; O0-NEXT: ; return to shader part epilog
%a = xor i32 %tid, 16
%masked = and i32 %a, 18 ; masked = (tid ^ 16) & 18, the shared sub-expression
%e = and i32 %masked, 16 ; RHS of the top XOR, built from masked
%r = xor i32 %masked, %e ; masked appears on both sides
%ret = bitcast i32 %r to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: (and | 0x80000000) ^ and, where and = mix & salt and mix = v ^ salt.
; and is shared between the OR and the outer XOR.
define amdgpu_ps float @bitop3_highbit_or_xor(i32 %v, i32 %salt) {
; SDAG-LABEL: bitop3_highbit_or_xor:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_brev_b32 s0, 1
; SDAG-NEXT: v_bitop3_b32 v2, v0, v1, v0 bitop3:0xc
; SDAG-NEXT: v_bitop3_b32 v0, v0, s0, v1 bitop3:0xce
; SDAG-NEXT: v_xor_b32_e32 v0, v0, v2
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: bitop3_highbit_or_xor:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
; GISEL-NEXT: v_or_b32_e32 v1, 0x80000000, v0
; GISEL-NEXT: v_xor_b32_e32 v0, v1, v0
; GISEL-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_highbit_or_xor:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v2, v1
; O0-NEXT: v_bitop3_b32 v1, v0, v2, v0 bitop3:0xc
; O0-NEXT: s_mov_b32 s0, 0x80000000
; O0-NEXT: v_bitop3_b32 v0, v0, s0, v2 bitop3:0xce
; O0-NEXT: v_xor_b32_e64 v0, v0, v1
; O0-NEXT: ; return to shader part epilog
%mix = xor i32 %v, %salt
%and = and i32 %mix, %salt ; shared sub-expression; salt is used twice inside it
%or = or i32 %and, -2147483648 ; sets bit 31; -2147483648 is 0x80000000 as i32
%xor = xor i32 %or, %and ; and appears on both sides of this XOR
%ret = bitcast i32 %xor to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: (or3 ^ and) & or3, where and = wi | 3 and or3 = (wi + 25) | and.
; and is shared inside or3 and in the outer XOR; or3 is shared between the
; XOR and the final AND.
; The IR value names are historical: %and is produced by an OR and %and1 by
; an ADD.
define amdgpu_ps float @bitop3_or_xor_and(i32 %wi) {
; GCN-LABEL: bitop3_or_xor_and:
; GCN: ; %bb.0:
; GCN-NEXT: v_add_u32_e32 v1, 25, v0
; GCN-NEXT: v_bitop3_b32 v0, v1, v0, 3 bitop3:0x10
; GCN-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_or_xor_and:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v2, v0
; O0-NEXT: s_mov_b32 s0, 25
; O0-NEXT: v_add_u32_e64 v0, v2, s0
; O0-NEXT: s_mov_b32 s0, 3
; O0-NEXT: v_or3_b32 v1, v2, s0, v0
; O0-NEXT: v_bitop3_b32 v0, v0, v2, s0 bitop3:0x10
; O0-NEXT: v_and_b32_e64 v0, v0, v1
; O0-NEXT: ; return to shader part epilog
%and = or i32 %wi, 3 ; wi | 3 (an OR, despite the name); shared sub-expression
%and1 = add i32 %wi, 25 ; non-bitwise leaf of the tree
%or3 = or i32 %and1, %and ; first use of %and
%xor1 = xor i32 %or3, %and ; second use of %and
%and3 = and i32 %xor1, %or3 ; or3 appears on both sides of this AND
%ret = bitcast i32 %and3 to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: (xor | and) ^ xor, where xor = fshl(and, 0, 5) ^ and,
; and = ashr(v ^ C, 24) & v and C = 0x9E3779B9.
; and is shared across multiple levels of the bitwise tree; xor is shared
; between the OR and the outer XOR.
define amdgpu_ps float @bitop3_fshl_or_xor(i32 %v) {
; SDAG-LABEL: bitop3_fshl_or_xor:
; SDAG: ; %bb.0:
; SDAG-NEXT: v_xor_b32_e32 v1, 0x9e000000, v0
; SDAG-NEXT: v_ashrrev_i32_e32 v1, 24, v1
; SDAG-NEXT: v_and_b32_e32 v2, v1, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v2
; SDAG-NEXT: v_bitop3_b32 v0, v3, v1, v0 bitop3:0x78
; SDAG-NEXT: v_or_b32_e32 v1, v3, v2
; SDAG-NEXT: v_xor_b32_e32 v0, v1, v0
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: bitop3_fshl_or_xor:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_xor_b32_e32 v1, 0x9e3779b9, v0
; GISEL-NEXT: v_ashrrev_i32_e32 v1, 24, v1
; GISEL-NEXT: v_and_b32_e32 v2, v1, v0
; GISEL-NEXT: v_alignbit_b32 v2, v2, 0, 27
; GISEL-NEXT: v_bitop3_b32 v3, v2, v1, v0 bitop3:0x78
; GISEL-NEXT: v_bitop3_b32 v0, v2, v1, v0 bitop3:0xf8
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v3
; GISEL-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_fshl_or_xor:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v3, v0
; O0-NEXT: s_mov_b32 s0, 0x9e3779b9
; O0-NEXT: v_xor_b32_e64 v0, v3, s0
; O0-NEXT: s_mov_b32 s0, 24
; O0-NEXT: v_ashrrev_i32_e64 v2, s0, v0
; O0-NEXT: v_and_b32_e64 v0, v2, v3
; O0-NEXT: s_mov_b32 s1, -5
; O0-NEXT: s_mov_b32 s0, 0
; O0-NEXT: v_mov_b32_e32 v1, s1
; O0-NEXT: v_alignbit_b32 v0, v0, s0, v1
; O0-NEXT: v_bitop3_b32 v1, v0, v2, v3 bitop3:0x78
; O0-NEXT: v_bitop3_b32 v0, v0, v2, v3 bitop3:0xf8
; O0-NEXT: v_xor_b32_e64 v0, v0, v1
; O0-NEXT: ; return to shader part epilog
%mix = xor i32 %v, -1640531527 ; v ^ C; -1640531527 is 0x9E3779B9 as i32
%ashr = ashr i32 %mix, 24
%and = and i32 %ashr, %v ; shared sub-expression; v is used twice inside it
%fshl = call i32 @llvm.fshl.i32(i32 %and, i32 0, i32 5) ; funnel shift with a zero low word, i.e. and << 5
%xor = xor i32 %fshl, %and ; second-level shared value
%or = or i32 %xor, %and ; reuses both xor and and
%xor1 = xor i32 %or, %xor ; xor appears on both sides
%ret = bitcast i32 %xor1 to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: (x ^ C) & x, where x = (wi ^ salt) & salt and C = 0x79AD5691.
; x is shared between the XOR and the outer AND.
define amdgpu_ps float @bitop3_and_xor_constant(i32 %wi, i32 %salt) {
; SDAG-LABEL: bitop3_and_xor_constant:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_mov_b32 s0, 0x79ad5691
; SDAG-NEXT: v_bitop3_b32 v2, v0, v1, v0 bitop3:0xc
; SDAG-NEXT: v_bitop3_b32 v0, v0, s0, v1 bitop3:0xc6
; SDAG-NEXT: v_and_b32_e32 v0, v0, v2
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: bitop3_and_xor_constant:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
; GISEL-NEXT: v_xor_b32_e32 v1, 0x79ad5691, v0
; GISEL-NEXT: v_and_b32_e32 v0, v1, v0
; GISEL-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_and_xor_constant:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v2, v1
; O0-NEXT: v_bitop3_b32 v1, v0, v2, v0 bitop3:0xc
; O0-NEXT: s_mov_b32 s0, 0x79ad5691
; O0-NEXT: v_bitop3_b32 v0, v0, s0, v2 bitop3:0xc6
; O0-NEXT: v_and_b32_e64 v0, v0, v1
; O0-NEXT: ; return to shader part epilog
%mix = xor i32 %wi, %salt
%x = and i32 %mix, %salt ; shared sub-expression; salt is used twice inside it
%xor = xor i32 %x, 2041403025 ; x ^ C; 2041403025 is 0x79AD5691
%result = and i32 %xor, %x ; x appears on both sides of this AND
%ret = bitcast i32 %result to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: (x & b) ^ x, where x = a ^ b, a = (wi + 32) | 9 and
; b = (wi + 32) ^ (32 - wi).
; x and b are both shared across the tree, and wi + 32 feeds both a and b.
define amdgpu_ps float @bitop3_and_xor_identity(i32 %wi) {
; SDAG-LABEL: bitop3_and_xor_identity:
; SDAG: ; %bb.0:
; SDAG-NEXT: v_add_u32_e32 v1, 32, v0
; SDAG-NEXT: v_sub_u32_e32 v0, 32, v0
; SDAG-NEXT: v_bitop3_b32 v2, v1, v0, 9 bitop3:0xc6
; SDAG-NEXT: v_bitop3_b32 v0, v1, v0, 9 bitop3:4
; SDAG-NEXT: v_xor_b32_e32 v0, v0, v2
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: bitop3_and_xor_identity:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_add_u32_e32 v1, 32, v0
; GISEL-NEXT: v_sub_u32_e32 v0, 32, v0
; GISEL-NEXT: v_or_b32_e32 v2, 9, v1
; GISEL-NEXT: v_xor_b32_e32 v0, v1, v0
; GISEL-NEXT: v_xor_b32_e32 v1, v2, v0
; GISEL-NEXT: v_bfi_b32 v0, v0, 0, v1
; GISEL-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_and_xor_identity:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v1, v0
; O0-NEXT: s_mov_b32 s0, 32
; O0-NEXT: v_add_u32_e64 v0, v1, s0
; O0-NEXT: v_sub_u32_e64 v2, s0, v1
; O0-NEXT: s_mov_b32 s0, 9
; O0-NEXT: v_bitop3_b32 v1, v0, v2, s0 bitop3:0xc6
; O0-NEXT: v_bitop3_b32 v0, v0, v2, s0 bitop3:4
; O0-NEXT: v_xor_b32_e64 v0, v0, v1
; O0-NEXT: ; return to shader part epilog
%same0 = add i32 %wi, 32 ; wi + 32, used by both a and b
%a = or i32 %same0, 9
%same1 = sub i32 32, %wi ; 32 - wi
%b = xor i32 %same0, %same1 ; shared: used in x and again in the AND
%x = xor i32 %a, %b ; shared: used in the AND and in the outer XOR
%and = and i32 %x, %b
%result = xor i32 %and, %x ; x appears on both sides of this XOR
%ret = bitcast i32 %result to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: (umax(shl & ~mix, 2) & mix) & ~mix, where mix = v ^ salt,
; salt = wi * 0x9E3779B9 and shl = wi << 15.
; Same shape as bitop3_umax_and_not, but salt and shl are computed from wi
; instead of being passed in as arguments.
; mix and ~mix both reach the final AND chain, so the result is always 0
; (x & ~x == 0).
define amdgpu_ps float @bitop3_umax_and_not_v2(i32 %wi, i32 %v) {
; GCN-LABEL: bitop3_umax_and_not_v2:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_umax_and_not_v2:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v2, v1
; O0-NEXT: s_mov_b32 s0, 0x9e3779b9
; O0-NEXT: v_mul_lo_u32 v3, v0, s0
; O0-NEXT: v_xnor_b32_e64 v1, v2, v3
; O0-NEXT: s_mov_b32 s0, 15
; O0-NEXT: v_lshlrev_b32_e64 v0, s0, v0
; O0-NEXT: v_bitop3_b32 v0, v0, v2, v3 bitop3:0x90
; O0-NEXT: s_mov_b32 s0, 2
; O0-NEXT: v_max_u32_e64 v0, v0, s0
; O0-NEXT: v_bitop3_b32 v0, v0, v2, v3 bitop3:0x60
; O0-NEXT: v_and_b32_e64 v0, v0, v1
; O0-NEXT: ; return to shader part epilog
%salt = mul i32 %wi, -1640531527 ; wi * 0x9E3779B9 (-1640531527 as i32)
%mix = xor i32 %v, %salt ; shared sub-expression
%not = xor i32 %mix, -1 ; ~mix
%shl = shl i32 %wi, 15
%masked = and i32 %shl, %not ; first use of ~mix
%umax = call i32 @llvm.umax.i32(i32 %masked, i32 2) ; non-bitwise op in the middle of the bitwise tree
%and0 = and i32 %umax, %mix ; use of mix
%result = and i32 %and0, %not ; second use of ~mix; mix & ~mix makes this 0
%ret = bitcast i32 %result to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: select(y >s x, 0, y & x), where x = ~(fshl & mask), y = sum ^ x,
; fshl = fshl(n, n, 14) and mask = wi + 32.
; x is shared across the XOR, the AND, and the signed compare feeding the
; select.
define amdgpu_ps float @bitop3_fshl_select_shared(i32 %n, i32 %wi, i32 %sum) {
; SDAG-LABEL: bitop3_fshl_select_shared:
; SDAG: ; %bb.0:
; SDAG-NEXT: v_alignbit_b32 v0, v0, v0, 18
; SDAG-NEXT: v_add_u32_e32 v1, 32, v1
; SDAG-NEXT: v_bitop3_b32 v3, v0, v1, v0 bitop3:0x3f
; SDAG-NEXT: v_bitop3_b32 v0, v2, v0, v1 bitop3:0x87
; SDAG-NEXT: v_and_b32_e32 v1, v0, v3
; SDAG-NEXT: v_cmp_le_i32_e32 vcc, v0, v3
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: bitop3_fshl_select_shared:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_alignbit_b32 v0, v0, v0, 18
; GISEL-NEXT: v_add_u32_e32 v1, 32, v1
; GISEL-NEXT: v_and_b32_e32 v3, v0, v1
; GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0x3f
; GISEL-NEXT: v_xnor_b32_e32 v1, v3, v2
; GISEL-NEXT: v_and_b32_e32 v2, v1, v0
; GISEL-NEXT: v_cmp_gt_i32_e32 vcc, v1, v0
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; GISEL-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_fshl_select_shared:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v3, v1
; O0-NEXT: s_mov_b32 s0, 18
; O0-NEXT: v_lshrrev_b32_e64 v4, s0, v0
; O0-NEXT: s_mov_b32 s0, 14
; O0-NEXT: v_lshlrev_b32_e64 v1, s0, v0
; O0-NEXT: v_or_b32_e64 v0, v1, v4
; O0-NEXT: s_mov_b32 s0, 32
; O0-NEXT: v_add_u32_e64 v3, v3, s0
; O0-NEXT: v_bitop3_b32 v1, v1, v3, v4 bitop3:0x37
; O0-NEXT: v_bitop3_b32 v0, v2, v0, v3 bitop3:0x87
; O0-NEXT: v_cmp_gt_i32_e64 s[0:1], v0, v1
; O0-NEXT: v_and_b32_e64 v0, v0, v1
; O0-NEXT: s_mov_b32 s2, 0
; O0-NEXT: v_mov_b32_e32 v1, s2
; O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; O0-NEXT: ; return to shader part epilog
%fshl = call i32 @llvm.fshl.i32(i32 %n, i32 %n, i32 14) ; both inputs are n, i.e. rotate n left by 14
%mask = add i32 %wi, 32
%masked = and i32 %fshl, %mask
%x = xor i32 %masked, -1 ; x = ~(fshl & mask), the shared value
%y = xor i32 %sum, %x ; first use of x
%cmp = icmp sgt i32 %y, %x ; second use of x, outside the bitwise tree
%and = and i32 %y, %x ; third use of x; x also reaches this AND through y
%result = select i1 %cmp, i32 0, i32 %and ; 0 when y >s x, otherwise y & x
%ret = bitcast i32 %result to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: sub(C, n & 1) | ((n & 1) ^ 1), where C = 0xEFFFC001.
; n & 1 is shared between the sub and the xor.
; The bitop3 lowering must use n & 1, not the full unmasked n.
define amdgpu_ps float @bitop3_sub_or_xor(i32 %n) {
; SDAG-LABEL: bitop3_sub_or_xor:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_mov_b32 s0, 0xefffc001
; SDAG-NEXT: v_bitop3_b32 v1, v0, s0, 1 bitop3:0x6c
; SDAG-NEXT: v_bitop3_b32 v0, v0, 1, v0 bitop3:0xc
; SDAG-NEXT: v_or_b32_e32 v0, v1, v0
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: bitop3_sub_or_xor:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_and_b32_e32 v1, 1, v0
; GISEL-NEXT: v_sub_u32_e32 v1, 0xefffc001, v1
; GISEL-NEXT: v_bitop3_b32 v0, v0, 1, v0 bitop3:0xc
; GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_sub_or_xor:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v1, v0
; O0-NEXT: s_mov_b32 s0, 1
; O0-NEXT: v_and_b32_e64 v0, v1, s0
; O0-NEXT: s_mov_b32 s1, 0xefffc001
; O0-NEXT: v_sub_u32_e64 v0, s1, v0
; O0-NEXT: v_bitop3_b32 v0, v0, v1, s0 bitop3:0xf2
; O0-NEXT: ; return to shader part epilog
%mask = and i32 %n, 1 ; shared sub-expression: bit 0 of n
%add = sub nuw nsw i32 -268451839, %mask ; 0xEFFFC001 - mask (a sub, despite the name)
%z = xor i32 %mask, 1 ; mask with bit 0 flipped
%result = or i32 %add, %z ; both operands are derived from mask
%ret = bitcast i32 %result to float ; return the raw 32 bits as a float
ret float %ret
}
; Test (full chain): (base + z) | z, where z = zext(ctlz(shl(n, 31)) != 0)
; and base = 0xEFFFC000.
; Pre-optimization form of the bitop3_sub_or_xor pattern: z is 1 exactly when
; bit 0 of n is clear, i.e. z = (n & 1) ^ 1, so base + z equals
; 0xEFFFC001 - (n & 1).
; z is shared between the add and the OR.
define amdgpu_ps float @bitop3_ctlz_shl_or(i32 %n) {
; SDAG-LABEL: bitop3_ctlz_shl_or:
; SDAG: ; %bb.0:
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 31, v0
; SDAG-NEXT: v_ffbh_u32_e32 v0, v0
; SDAG-NEXT: v_min_u32_e32 v0, 32, v0
; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SDAG-NEXT: v_or_b32_e32 v0, 0xefffc000, v0
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: bitop3_ctlz_shl_or:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 31, v0
; GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GISEL-NEXT: v_min_u32_e32 v0, 32, v0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_u32_e32 v1, 0xefffc000, v0
; GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_ctlz_shl_or:
; O0: ; %bb.0:
; O0-NEXT: s_mov_b32 s0, 31
; O0-NEXT: v_lshlrev_b32_e64 v0, s0, v0
; O0-NEXT: v_ffbh_u32_e64 v0, v0
; O0-NEXT: s_mov_b32 s0, 32
; O0-NEXT: v_min_u32_e64 v0, v0, s0
; O0-NEXT: s_mov_b32 s0, 0
; O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0
; O0-NEXT: s_nop 1
; O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; O0-NEXT: s_mov_b32 s0, 0xefffc000
; O0-NEXT: v_add_u32_e64 v0, v1, s0
; O0-NEXT: v_or_b32_e64 v0, v0, v1
; O0-NEXT: ; return to shader part epilog
%shl.n = shl i32 %n, 31 ; moves bit 0 of n into bit 31, clearing everything else
%ctlz = call i32 @llvm.ctlz.i32(i32 %shl.n, i1 false) ; 0 iff bit 31 is set; the false flag keeps a zero input well-defined
%nz = icmp ne i32 %ctlz, 0 ; true when bit 0 of n is clear
%z = zext i1 %nz to i32 ; shared value, 0 or 1
%base = shl i32 2147467263, 14 ; 0x7FFFBFFF << 14 = 0xEFFFC000 in 32 bits
%add = add i32 %base, %z ; first use of z
%result = or i32 %add, %z ; second use of z
%ret = bitcast i32 %result to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: ((b ^ t) | t) & ~t, where t = c & a.
; The shared sub-expression t appears in the XOR, the OR, and the NOT.
; The correct result is b & ~(c & a), since (b ^ t) | t == b | t and
; (b | t) & ~t == b & ~t.
define amdgpu_ps float @bitop3_bxort_or_t_and_not_t(i32 %a, i32 %b, i32 %c) {
; SDAG-LABEL: bitop3_bxort_or_t_and_not_t:
; SDAG: ; %bb.0:
; SDAG-NEXT: v_and_b32_e32 v0, v2, v0
; SDAG-NEXT: v_or_b32_e32 v1, v1, v0
; SDAG-NEXT: v_bfi_b32 v0, v0, 0, v1
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: bitop3_bxort_or_t_and_not_t:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_and_b32_e32 v3, v2, v0
; GISEL-NEXT: v_bitop3_b32 v0, v1, v2, v0 bitop3:0xf8
; GISEL-NEXT: v_bfi_b32 v0, v3, 0, v0
; GISEL-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_bxort_or_t_and_not_t:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v3, v0
; O0-NEXT: v_and_b32_e64 v0, v2, v3
; O0-NEXT: v_bitop3_b32 v1, v1, v2, v3 bitop3:0xf8
; O0-NEXT: v_bfi_b32 v0, v0, 0, v1
; O0-NEXT: ; return to shader part epilog
%t = and i32 %c, %a ; shared sub-expression
%u = xor i32 %b, %t ; first use of t
%ut = or i32 %u, %t ; second use of t
%nt = xor i32 %t, -1 ; ~t, third use of t
%r = and i32 %ut, %nt ; t reaches both operands of this AND
%ret = bitcast i32 %r to float ; return the raw 32 bits as a float
ret float %ret
}
; Test: ((a & b) & (a | c)) ^ ((a & b) | (a | c)).
; Two shared sub-expressions, T1 = a & b and T2 = a | c, each appear twice,
; and a is an input of both.
; Simplifies to (a & b) ^ (a | c), since (X & Y) ^ (X | Y) == X ^ Y.
define amdgpu_ps float @bitop3_t1t2_and_xor_or(i32 %a, i32 %b, i32 %c) {
; SDAG-LABEL: bitop3_t1t2_and_xor_or:
; SDAG: ; %bb.0:
; SDAG-NEXT: v_bitop3_b32 v0, v0, v2, v1 bitop3:0x5c
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: bitop3_t1t2_and_xor_or:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_and_b32_e32 v1, v0, v1
; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
; GISEL-NEXT: v_xor_b32_e32 v0, v1, v0
; GISEL-NEXT: ; return to shader part epilog
;
; O0-LABEL: bitop3_t1t2_and_xor_or:
; O0: ; %bb.0:
; O0-NEXT: v_mov_b32_e32 v3, v1
; O0-NEXT: v_mov_b32_e32 v1, v0
; O0-NEXT: v_bitop3_b32 v0, v1, v2, v3 bitop3:0xa0
; O0-NEXT: v_bitop3_b32 v1, v1, v2, v3 bitop3:0xfc
; O0-NEXT: v_xor_b32_e64 v0, v0, v1
; O0-NEXT: ; return to shader part epilog
%t1 = and i32 %a, %b ; T1, shared
%t2 = or i32 %a, %c ; T2, shared
%and = and i32 %t1, %t2 ; first use of T1 and T2
%or = or i32 %t1, %t2 ; second use of T1 and T2
%r = xor i32 %and, %or ; both operands are built from the same T1 and T2
%ret = bitcast i32 %r to float ; return the raw 32 bits as a float
ret float %ret
}
; Declarations of the intrinsics called by the test functions in this file.
declare i32 @llvm.umax.i32(i32, i32)
declare i32 @llvm.fshl.i32(i32, i32, i32)
declare i32 @llvm.ctlz.i32(i32, i1 immarg)