| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -global-isel=0 -mtriple=amdgcn-- -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,SDAG %s |
| ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GISEL %s |
| ; RUN: llc -O0 -mtriple=amdgcn-- -mcpu=gfx950 < %s | FileCheck -check-prefix=O0 %s |
| |
| ; Regression tests for v_bitop3_b32 truth table computation when LHS and RHS |
| ; of a boolean node share a common sub-expression. The BitOp3_Op algorithm |
| ; can decompose the shared sub-expression on one side and invalidate the |
| ; bit-pattern assigned to the other side. |
| |
| ; result = ((wi ^ x) & x) | mul, where x = (mul ^ 0xaaaaaaaa) & mul. |
| ; x feeds both the XOR and the AND under the top OR; the expected code has a v_bitop3_b32 taking mul twice (0x48) and ends with v_and_or_b32. |
define amdgpu_ps float @bitop3_xor_and_or(i32 %wi, i32 %mul) { |
| ; GCN-LABEL: bitop3_xor_and_or: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_mov_b32 s0, 0xaaaaaaaa |
| ; GCN-NEXT: v_xor_b32_e32 v2, 0xaaaaaaaa, v1 |
| ; GCN-NEXT: v_bitop3_b32 v3, v1, v1, s0 bitop3:0x48 |
| ; GCN-NEXT: v_bitop3_b32 v0, v0, v2, v1 bitop3:0x78 |
| ; GCN-NEXT: v_and_or_b32 v0, v0, v3, v1 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_xor_and_or: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v2, v1 |
| ; O0-NEXT: s_mov_b32 s0, 0xaaaaaaaa |
| ; O0-NEXT: v_xor_b32_e64 v3, v2, s0 |
| ; O0-NEXT: v_bitop3_b32 v1, v2, v2, s0 bitop3:0x48 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v3, v2 bitop3:0x78 |
| ; O0-NEXT: v_and_or_b32 v0, v0, v1, v2 |
| ; O0-NEXT: ; return to shader part epilog |
| %xor = xor i32 %mul, -1431655766 |
| %x = and i32 %xor, %mul |
| %yxor = xor i32 %wi, %x |
| %and = and i32 %yxor, %x |
| %result = or i32 %and, %mul |
| %ret = bitcast i32 %result to float |
| ret float %ret |
| } |
| |
| ; result = (umax(shl & not, 2) & mix) & not, where mix = v ^ salt and not = ~mix. |
| ; mix and ~mix meet in the final AND, so the value is always 0: the optimized runs emit a plain move of 0, while the O0 run keeps the full chain. |
| define amdgpu_ps float @bitop3_umax_and_not(i32 %v, i32 %salt, i32 %shl) { |
| ; GCN-LABEL: bitop3_umax_and_not: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mov_b32_e32 v0, 0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_umax_and_not: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_accvgpr_write_b32 a0, v2 ; Reload Reuse |
| ; O0-NEXT: v_mov_b32_e32 v3, v1 |
| ; O0-NEXT: v_mov_b32_e32 v2, v0 |
| ; O0-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse |
| ; O0-NEXT: v_xnor_b32_e64 v1, v2, v3 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v2, v3 bitop3:0x90 |
| ; O0-NEXT: s_mov_b32 s0, 2 |
| ; O0-NEXT: v_max_u32_e64 v0, v0, s0 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v2, v3 bitop3:0x60 |
| ; O0-NEXT: v_and_b32_e64 v0, v0, v1 |
| ; O0-NEXT: ; return to shader part epilog |
| %mix = xor i32 %v, %salt |
| %not = xor i32 %mix, -1 |
| %masked = and i32 %shl, %not |
| %umax = call i32 @llvm.umax.i32(i32 %masked, i32 2) |
| %and0 = and i32 %umax, %mix |
| %result = and i32 %and0, %not |
| %ret = bitcast i32 %result to float |
| ret float %ret |
| } |
| |
| ; result = ((sel ^ mix) & sel) >>s 31, where sel = umax(mix << 8, mix) and mix = v ^ salt. |
| ; sel is an operand of both the XOR and the AND, so the expected v_bitop3_b32 (0x48) lists the umax result register twice. |
| define amdgpu_ps float @bitop3_umax_xor_and(i32 %v, i32 %salt) { |
| ; GCN-LABEL: bitop3_umax_xor_and: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v0 |
| ; GCN-NEXT: v_max_u32_e32 v1, v1, v0 |
| ; GCN-NEXT: v_bitop3_b32 v0, v1, v1, v0 bitop3:0x48 |
| ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_umax_xor_and: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_xor_b32_e64 v1, v0, v1 |
| ; O0-NEXT: s_mov_b32 s0, 8 |
| ; O0-NEXT: v_lshlrev_b32_e64 v0, s0, v1 |
| ; O0-NEXT: v_max_u32_e64 v0, v0, v1 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v0, v1 bitop3:0x48 |
| ; O0-NEXT: s_mov_b32 s0, 31 |
| ; O0-NEXT: v_ashrrev_i32_e64 v0, s0, v0 |
| ; O0-NEXT: ; return to shader part epilog |
| %mix = xor i32 %v, %salt |
| %shl = shl i32 %mix, 8 |
| %sel = call i32 @llvm.umax.i32(i32 %shl, i32 %mix) |
| %xor = xor i32 %sel, %mix |
| %and = and i32 %xor, %sel |
| %result = ashr i32 %and, 31 |
| %ret = bitcast i32 %result to float |
| ret float %ret |
| } |
| |
| ; r = masked ^ (masked & 16), where masked = (tid ^ 16) & 18. |
| ; masked is shared between the LHS and the RHS of the top XOR; tid is the only non-constant input. |
| define amdgpu_ps float @bitop3_xor_masked_and(i32 %tid) { |
| ; GCN-LABEL: bitop3_xor_masked_and: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_bitop3_b32 v1, v0, 18, 16 bitop3:0x48 |
| ; GCN-NEXT: v_bitop3_b32 v0, v0, 16, v0 bitop3:0xc |
| ; GCN-NEXT: v_xor_b32_e32 v0, v1, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_xor_masked_and: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v1, v0 |
| ; O0-NEXT: s_mov_b32 s1, 16 |
| ; O0-NEXT: v_xor_b32_e64 v0, v1, s1 |
| ; O0-NEXT: s_mov_b32 s0, 18 |
| ; O0-NEXT: v_mov_b32_e32 v2, s0 |
| ; O0-NEXT: v_bitop3_b32 v1, v1, s1, v2 bitop3:8 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v1, s0 bitop3:0x6c |
| ; O0-NEXT: ; return to shader part epilog |
| %a = xor i32 %tid, 16 |
| %masked = and i32 %a, 18 |
| %e = and i32 %masked, 16 |
| %r = xor i32 %masked, %e |
| %ret = bitcast i32 %r to float |
| ret float %ret |
| } |
| |
| ; result = (t | 0x80000000) ^ t, where t = (v ^ salt) & salt. |
| ; t is shared between the OR and the outer XOR; each expected sequence contains a v_bitop3_b32 over v and salt with truth table 0xc. |
| define amdgpu_ps float @bitop3_highbit_or_xor(i32 %v, i32 %salt) { |
| ; SDAG-LABEL: bitop3_highbit_or_xor: |
| ; SDAG: ; %bb.0: |
| ; SDAG-NEXT: s_brev_b32 s0, 1 |
| ; SDAG-NEXT: v_bitop3_b32 v2, v0, v1, v0 bitop3:0xc |
| ; SDAG-NEXT: v_bitop3_b32 v0, v0, s0, v1 bitop3:0xce |
| ; SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 |
| ; SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GISEL-LABEL: bitop3_highbit_or_xor: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc |
| ; GISEL-NEXT: v_or_b32_e32 v1, 0x80000000, v0 |
| ; GISEL-NEXT: v_xor_b32_e32 v0, v1, v0 |
| ; GISEL-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_highbit_or_xor: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v2, v1 |
| ; O0-NEXT: v_bitop3_b32 v1, v0, v2, v0 bitop3:0xc |
| ; O0-NEXT: s_mov_b32 s0, 0x80000000 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, s0, v2 bitop3:0xce |
| ; O0-NEXT: v_xor_b32_e64 v0, v0, v1 |
| ; O0-NEXT: ; return to shader part epilog |
| %mix = xor i32 %v, %salt |
| %and = and i32 %mix, %salt |
| %or = or i32 %and, -2147483648 |
| %xor = xor i32 %or, %and |
| %ret = bitcast i32 %xor to float |
| ret float %ret |
| } |
| |
| ; result = (or3 ^ t) & or3, where or3 = (wi + 25) | t and t = wi | 3 (t is the IR value named %and, although it is an OR). |
| ; t is shared inside or3 and in the outer XOR; the optimized runs need only the add plus one v_bitop3_b32 (0x10). |
| define amdgpu_ps float @bitop3_or_xor_and(i32 %wi) { |
| ; GCN-LABEL: bitop3_or_xor_and: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_add_u32_e32 v1, 25, v0 |
| ; GCN-NEXT: v_bitop3_b32 v0, v1, v0, 3 bitop3:0x10 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_or_xor_and: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v2, v0 |
| ; O0-NEXT: s_mov_b32 s0, 25 |
| ; O0-NEXT: v_add_u32_e64 v0, v2, s0 |
| ; O0-NEXT: s_mov_b32 s0, 3 |
| ; O0-NEXT: v_or3_b32 v1, v2, s0, v0 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v2, s0 bitop3:0x10 |
| ; O0-NEXT: v_and_b32_e64 v0, v0, v1 |
| ; O0-NEXT: ; return to shader part epilog |
| %and = or i32 %wi, 3 |
| %and1 = add i32 %wi, 25 |
| %or3 = or i32 %and1, %and |
| %xor1 = xor i32 %or3, %and |
| %and3 = and i32 %xor1, %or3 |
| %ret = bitcast i32 %and3 to float |
| ret float %ret |
| } |
| |
| ; result = (x | t) ^ x, where x = fshl(t, 0, 5) ^ t and t = ((v ^ 0x9e3779b9) >>s 24) & v. |
| ; t is shared across several levels of the bitwise tree; the funnel shift is expected as a left shift by 5 (SDAG prefix) or as v_alignbit_b32 (GISEL and O0 prefixes). |
| define amdgpu_ps float @bitop3_fshl_or_xor(i32 %v) { |
| ; SDAG-LABEL: bitop3_fshl_or_xor: |
| ; SDAG: ; %bb.0: |
| ; SDAG-NEXT: v_xor_b32_e32 v1, 0x9e000000, v0 |
| ; SDAG-NEXT: v_ashrrev_i32_e32 v1, 24, v1 |
| ; SDAG-NEXT: v_and_b32_e32 v2, v1, v0 |
| ; SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v2 |
| ; SDAG-NEXT: v_bitop3_b32 v0, v3, v1, v0 bitop3:0x78 |
| ; SDAG-NEXT: v_or_b32_e32 v1, v3, v2 |
| ; SDAG-NEXT: v_xor_b32_e32 v0, v1, v0 |
| ; SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GISEL-LABEL: bitop3_fshl_or_xor: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: v_xor_b32_e32 v1, 0x9e3779b9, v0 |
| ; GISEL-NEXT: v_ashrrev_i32_e32 v1, 24, v1 |
| ; GISEL-NEXT: v_and_b32_e32 v2, v1, v0 |
| ; GISEL-NEXT: v_alignbit_b32 v2, v2, 0, 27 |
| ; GISEL-NEXT: v_bitop3_b32 v3, v2, v1, v0 bitop3:0x78 |
| ; GISEL-NEXT: v_bitop3_b32 v0, v2, v1, v0 bitop3:0xf8 |
| ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v3 |
| ; GISEL-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_fshl_or_xor: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v3, v0 |
| ; O0-NEXT: s_mov_b32 s0, 0x9e3779b9 |
| ; O0-NEXT: v_xor_b32_e64 v0, v3, s0 |
| ; O0-NEXT: s_mov_b32 s0, 24 |
| ; O0-NEXT: v_ashrrev_i32_e64 v2, s0, v0 |
| ; O0-NEXT: v_and_b32_e64 v0, v2, v3 |
| ; O0-NEXT: s_mov_b32 s1, -5 |
| ; O0-NEXT: s_mov_b32 s0, 0 |
| ; O0-NEXT: v_mov_b32_e32 v1, s1 |
| ; O0-NEXT: v_alignbit_b32 v0, v0, s0, v1 |
| ; O0-NEXT: v_bitop3_b32 v1, v0, v2, v3 bitop3:0x78 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v2, v3 bitop3:0xf8 |
| ; O0-NEXT: v_xor_b32_e64 v0, v0, v1 |
| ; O0-NEXT: ; return to shader part epilog |
| %mix = xor i32 %v, -1640531527 |
| %ashr = ashr i32 %mix, 24 |
| %and = and i32 %ashr, %v |
| %fshl = call i32 @llvm.fshl.i32(i32 %and, i32 0, i32 5) |
| %xor = xor i32 %fshl, %and |
| %or = or i32 %xor, %and |
| %xor1 = xor i32 %or, %xor |
| %ret = bitcast i32 %xor1 to float |
| ret float %ret |
| } |
| |
| ; result = (x ^ 0x79ad5691) & x, where x = (wi ^ salt) & salt. |
| ; x is shared between the XOR and the outer AND; each expected sequence contains a v_bitop3_b32 over wi and salt with truth table 0xc. |
| define amdgpu_ps float @bitop3_and_xor_constant(i32 %wi, i32 %salt) { |
| ; SDAG-LABEL: bitop3_and_xor_constant: |
| ; SDAG: ; %bb.0: |
| ; SDAG-NEXT: s_mov_b32 s0, 0x79ad5691 |
| ; SDAG-NEXT: v_bitop3_b32 v2, v0, v1, v0 bitop3:0xc |
| ; SDAG-NEXT: v_bitop3_b32 v0, v0, s0, v1 bitop3:0xc6 |
| ; SDAG-NEXT: v_and_b32_e32 v0, v0, v2 |
| ; SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GISEL-LABEL: bitop3_and_xor_constant: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc |
| ; GISEL-NEXT: v_xor_b32_e32 v1, 0x79ad5691, v0 |
| ; GISEL-NEXT: v_and_b32_e32 v0, v1, v0 |
| ; GISEL-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_and_xor_constant: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v2, v1 |
| ; O0-NEXT: v_bitop3_b32 v1, v0, v2, v0 bitop3:0xc |
| ; O0-NEXT: s_mov_b32 s0, 0x79ad5691 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, s0, v2 bitop3:0xc6 |
| ; O0-NEXT: v_and_b32_e64 v0, v0, v1 |
| ; O0-NEXT: ; return to shader part epilog |
| %mix = xor i32 %wi, %salt |
| %x = and i32 %mix, %salt |
| %xor = xor i32 %x, 2041403025 |
| %result = and i32 %xor, %x |
| %ret = bitcast i32 %result to float |
| ret float %ret |
| } |
| |
| ; result = (x & b) ^ x, where x = a ^ b, a = (wi + 32) | 9, b = (wi + 32) ^ (32 - wi). |
| ; x and b are both shared across the tree; wi + 32 is additionally shared between a and b. |
| define amdgpu_ps float @bitop3_and_xor_identity(i32 %wi) { |
| ; SDAG-LABEL: bitop3_and_xor_identity: |
| ; SDAG: ; %bb.0: |
| ; SDAG-NEXT: v_add_u32_e32 v1, 32, v0 |
| ; SDAG-NEXT: v_sub_u32_e32 v0, 32, v0 |
| ; SDAG-NEXT: v_bitop3_b32 v2, v1, v0, 9 bitop3:0xc6 |
| ; SDAG-NEXT: v_bitop3_b32 v0, v1, v0, 9 bitop3:4 |
| ; SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 |
| ; SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GISEL-LABEL: bitop3_and_xor_identity: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: v_add_u32_e32 v1, 32, v0 |
| ; GISEL-NEXT: v_sub_u32_e32 v0, 32, v0 |
| ; GISEL-NEXT: v_or_b32_e32 v2, 9, v1 |
| ; GISEL-NEXT: v_xor_b32_e32 v0, v1, v0 |
| ; GISEL-NEXT: v_xor_b32_e32 v1, v2, v0 |
| ; GISEL-NEXT: v_bfi_b32 v0, v0, 0, v1 |
| ; GISEL-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_and_xor_identity: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v1, v0 |
| ; O0-NEXT: s_mov_b32 s0, 32 |
| ; O0-NEXT: v_add_u32_e64 v0, v1, s0 |
| ; O0-NEXT: v_sub_u32_e64 v2, s0, v1 |
| ; O0-NEXT: s_mov_b32 s0, 9 |
| ; O0-NEXT: v_bitop3_b32 v1, v0, v2, s0 bitop3:0xc6 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v2, s0 bitop3:4 |
| ; O0-NEXT: v_xor_b32_e64 v0, v0, v1 |
| ; O0-NEXT: ; return to shader part epilog |
| %same0 = add i32 %wi, 32 |
| %a = or i32 %same0, 9 |
| %same1 = sub i32 32, %wi |
| %b = xor i32 %same0, %same1 |
| %x = xor i32 %a, %b |
| %and = and i32 %x, %b |
| %result = xor i32 %and, %x |
| %ret = bitcast i32 %result to float |
| ret float %ret |
| } |
| |
| ; result = (umax(shl & not, 2) & mix) & not, where mix = v ^ salt, not = ~mix, salt = wi * 0x9e3779b9, shl = wi << 15. |
| ; Same shape as bitop3_umax_and_not with salt and shl derived from wi: mix & ~mix makes the value always 0, which the optimized runs emit as a move of 0. |
| define amdgpu_ps float @bitop3_umax_and_not_v2(i32 %wi, i32 %v) { |
| ; GCN-LABEL: bitop3_umax_and_not_v2: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mov_b32_e32 v0, 0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_umax_and_not_v2: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v2, v1 |
| ; O0-NEXT: s_mov_b32 s0, 0x9e3779b9 |
| ; O0-NEXT: v_mul_lo_u32 v3, v0, s0 |
| ; O0-NEXT: v_xnor_b32_e64 v1, v2, v3 |
| ; O0-NEXT: s_mov_b32 s0, 15 |
| ; O0-NEXT: v_lshlrev_b32_e64 v0, s0, v0 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v2, v3 bitop3:0x90 |
| ; O0-NEXT: s_mov_b32 s0, 2 |
| ; O0-NEXT: v_max_u32_e64 v0, v0, s0 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v2, v3 bitop3:0x60 |
| ; O0-NEXT: v_and_b32_e64 v0, v0, v1 |
| ; O0-NEXT: ; return to shader part epilog |
| %salt = mul i32 %wi, -1640531527 |
| %mix = xor i32 %v, %salt |
| %not = xor i32 %mix, -1 |
| %shl = shl i32 %wi, 15 |
| %masked = and i32 %shl, %not |
| %umax = call i32 @llvm.umax.i32(i32 %masked, i32 2) |
| %and0 = and i32 %umax, %mix |
| %result = and i32 %and0, %not |
| %ret = bitcast i32 %result to float |
| ret float %ret |
| } |
| |
| ; result = select(y >s x, 0, y & x), where x = ~(fshl(n, n, 14) & (wi + 32)) and y = sum ^ x. |
| ; x is shared across the XOR, the AND, and the signed compare feeding the select. |
| define amdgpu_ps float @bitop3_fshl_select_shared(i32 %n, i32 %wi, i32 %sum) { |
| ; SDAG-LABEL: bitop3_fshl_select_shared: |
| ; SDAG: ; %bb.0: |
| ; SDAG-NEXT: v_alignbit_b32 v0, v0, v0, 18 |
| ; SDAG-NEXT: v_add_u32_e32 v1, 32, v1 |
| ; SDAG-NEXT: v_bitop3_b32 v3, v0, v1, v0 bitop3:0x3f |
| ; SDAG-NEXT: v_bitop3_b32 v0, v2, v0, v1 bitop3:0x87 |
| ; SDAG-NEXT: v_and_b32_e32 v1, v0, v3 |
| ; SDAG-NEXT: v_cmp_le_i32_e32 vcc, v0, v3 |
| ; SDAG-NEXT: s_nop 1 |
| ; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc |
| ; SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GISEL-LABEL: bitop3_fshl_select_shared: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: v_alignbit_b32 v0, v0, v0, 18 |
| ; GISEL-NEXT: v_add_u32_e32 v1, 32, v1 |
| ; GISEL-NEXT: v_and_b32_e32 v3, v0, v1 |
| ; GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0x3f |
| ; GISEL-NEXT: v_xnor_b32_e32 v1, v3, v2 |
| ; GISEL-NEXT: v_and_b32_e32 v2, v1, v0 |
| ; GISEL-NEXT: v_cmp_gt_i32_e32 vcc, v1, v0 |
| ; GISEL-NEXT: s_nop 1 |
| ; GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc |
| ; GISEL-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_fshl_select_shared: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v3, v1 |
| ; O0-NEXT: s_mov_b32 s0, 18 |
| ; O0-NEXT: v_lshrrev_b32_e64 v4, s0, v0 |
| ; O0-NEXT: s_mov_b32 s0, 14 |
| ; O0-NEXT: v_lshlrev_b32_e64 v1, s0, v0 |
| ; O0-NEXT: v_or_b32_e64 v0, v1, v4 |
| ; O0-NEXT: s_mov_b32 s0, 32 |
| ; O0-NEXT: v_add_u32_e64 v3, v3, s0 |
| ; O0-NEXT: v_bitop3_b32 v1, v1, v3, v4 bitop3:0x37 |
| ; O0-NEXT: v_bitop3_b32 v0, v2, v0, v3 bitop3:0x87 |
| ; O0-NEXT: v_cmp_gt_i32_e64 s[0:1], v0, v1 |
| ; O0-NEXT: v_and_b32_e64 v0, v0, v1 |
| ; O0-NEXT: s_mov_b32 s2, 0 |
| ; O0-NEXT: v_mov_b32_e32 v1, s2 |
| ; O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] |
| ; O0-NEXT: ; return to shader part epilog |
| %fshl = call i32 @llvm.fshl.i32(i32 %n, i32 %n, i32 14) |
| %mask = add i32 %wi, 32 |
| %masked = and i32 %fshl, %mask |
| %x = xor i32 %masked, -1 |
| %y = xor i32 %sum, %x |
| %cmp = icmp sgt i32 %y, %x |
| %and = and i32 %y, %x |
| %result = select i1 %cmp, i32 0, i32 %and |
| %ret = bitcast i32 %result to float |
| ret float %ret |
| } |
| |
| ; result = (0xefffc001 - m) | (m ^ 1), where m = n & 1 is shared between the sub and the xor. |
| ; The bitop3 lowering must use n & 1, not the full unmasked n (note that the IR value named %add is this sub). |
| define amdgpu_ps float @bitop3_sub_or_xor(i32 %n) { |
| ; SDAG-LABEL: bitop3_sub_or_xor: |
| ; SDAG: ; %bb.0: |
| ; SDAG-NEXT: s_mov_b32 s0, 0xefffc001 |
| ; SDAG-NEXT: v_bitop3_b32 v1, v0, s0, 1 bitop3:0x6c |
| ; SDAG-NEXT: v_bitop3_b32 v0, v0, 1, v0 bitop3:0xc |
| ; SDAG-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GISEL-LABEL: bitop3_sub_or_xor: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: v_and_b32_e32 v1, 1, v0 |
| ; GISEL-NEXT: v_sub_u32_e32 v1, 0xefffc001, v1 |
| ; GISEL-NEXT: v_bitop3_b32 v0, v0, 1, v0 bitop3:0xc |
| ; GISEL-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GISEL-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_sub_or_xor: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v1, v0 |
| ; O0-NEXT: s_mov_b32 s0, 1 |
| ; O0-NEXT: v_and_b32_e64 v0, v1, s0 |
| ; O0-NEXT: s_mov_b32 s1, 0xefffc001 |
| ; O0-NEXT: v_sub_u32_e64 v0, s1, v0 |
| ; O0-NEXT: v_bitop3_b32 v0, v0, v1, s0 bitop3:0xf2 |
| ; O0-NEXT: ; return to shader part epilog |
| %mask = and i32 %n, 1 |
| %add = sub nuw nsw i32 -268451839, %mask |
| %z = xor i32 %mask, 1 |
| %result = or i32 %add, %z |
| %ret = bitcast i32 %result to float |
| ret float %ret |
| } |
| |
| ; Full chain: result = (base + z) | z, where z = zext(ctlz(n << 31) != 0) and base = 2147467263 << 14 (0xefffc000 in the checks). |
| ; Pre-optimization form of the bitop3_sub_or_xor pattern; none of the expected sequences contains a v_bitop3_b32. |
| define amdgpu_ps float @bitop3_ctlz_shl_or(i32 %n) { |
| ; SDAG-LABEL: bitop3_ctlz_shl_or: |
| ; SDAG: ; %bb.0: |
| ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 31, v0 |
| ; SDAG-NEXT: v_ffbh_u32_e32 v0, v0 |
| ; SDAG-NEXT: v_min_u32_e32 v0, 32, v0 |
| ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 |
| ; SDAG-NEXT: s_nop 1 |
| ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc |
| ; SDAG-NEXT: v_or_b32_e32 v0, 0xefffc000, v0 |
| ; SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GISEL-LABEL: bitop3_ctlz_shl_or: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 31, v0 |
| ; GISEL-NEXT: v_ffbh_u32_e32 v0, v0 |
| ; GISEL-NEXT: v_min_u32_e32 v0, 32, v0 |
| ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 |
| ; GISEL-NEXT: s_nop 1 |
| ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc |
| ; GISEL-NEXT: v_add_u32_e32 v1, 0xefffc000, v0 |
| ; GISEL-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GISEL-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_ctlz_shl_or: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: s_mov_b32 s0, 31 |
| ; O0-NEXT: v_lshlrev_b32_e64 v0, s0, v0 |
| ; O0-NEXT: v_ffbh_u32_e64 v0, v0 |
| ; O0-NEXT: s_mov_b32 s0, 32 |
| ; O0-NEXT: v_min_u32_e64 v0, v0, s0 |
| ; O0-NEXT: s_mov_b32 s0, 0 |
| ; O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 |
| ; O0-NEXT: s_nop 1 |
| ; O0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] |
| ; O0-NEXT: s_mov_b32 s0, 0xefffc000 |
| ; O0-NEXT: v_add_u32_e64 v0, v1, s0 |
| ; O0-NEXT: v_or_b32_e64 v0, v0, v1 |
| ; O0-NEXT: ; return to shader part epilog |
| %shl.n = shl i32 %n, 31 |
| %ctlz = call i32 @llvm.ctlz.i32(i32 %shl.n, i1 false) |
| %nz = icmp ne i32 %ctlz, 0 |
| %z = zext i1 %nz to i32 |
| %base = shl i32 2147467263, 14 |
| %add = add i32 %base, %z |
| %result = or i32 %add, %z |
| %ret = bitcast i32 %result to float |
| ret float %ret |
| } |
| |
| ; r = ((b ^ t) | t) & ~t, where t = c & a. |
| ; The shared sub-expression t appears in the XOR, the OR, and the NOT. |
| ; The correct result is b & ~(c & a); every expected sequence ends in a v_bfi_b32 whose first source is t. |
| define amdgpu_ps float @bitop3_bxort_or_t_and_not_t(i32 %a, i32 %b, i32 %c) { |
| ; SDAG-LABEL: bitop3_bxort_or_t_and_not_t: |
| ; SDAG: ; %bb.0: |
| ; SDAG-NEXT: v_and_b32_e32 v0, v2, v0 |
| ; SDAG-NEXT: v_or_b32_e32 v1, v1, v0 |
| ; SDAG-NEXT: v_bfi_b32 v0, v0, 0, v1 |
| ; SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GISEL-LABEL: bitop3_bxort_or_t_and_not_t: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: v_and_b32_e32 v3, v2, v0 |
| ; GISEL-NEXT: v_bitop3_b32 v0, v1, v2, v0 bitop3:0xf8 |
| ; GISEL-NEXT: v_bfi_b32 v0, v3, 0, v0 |
| ; GISEL-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_bxort_or_t_and_not_t: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v3, v0 |
| ; O0-NEXT: v_and_b32_e64 v0, v2, v3 |
| ; O0-NEXT: v_bitop3_b32 v1, v1, v2, v3 bitop3:0xf8 |
| ; O0-NEXT: v_bfi_b32 v0, v0, 0, v1 |
| ; O0-NEXT: ; return to shader part epilog |
| %t = and i32 %c, %a |
| %u = xor i32 %b, %t |
| %ut = or i32 %u, %t |
| %nt = xor i32 %t, -1 |
| %r = and i32 %ut, %nt |
| %ret = bitcast i32 %r to float |
| ret float %ret |
| } |
| |
| ; r = ((a & b) & (a | c)) ^ ((a & b) | (a | c)). |
| ; Two shared sub-expressions, T1 = a & b and T2 = a | c, each appear twice. |
| ; Simplifies to (a & b) ^ (a | c), since (T1 & T2) ^ (T1 | T2) == T1 ^ T2; the SDAG prefix expects a single v_bitop3_b32 (0x5c). |
| define amdgpu_ps float @bitop3_t1t2_and_xor_or(i32 %a, i32 %b, i32 %c) { |
| ; SDAG-LABEL: bitop3_t1t2_and_xor_or: |
| ; SDAG: ; %bb.0: |
| ; SDAG-NEXT: v_bitop3_b32 v0, v0, v2, v1 bitop3:0x5c |
| ; SDAG-NEXT: ; return to shader part epilog |
| ; |
| ; GISEL-LABEL: bitop3_t1t2_and_xor_or: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: v_and_b32_e32 v1, v0, v1 |
| ; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GISEL-NEXT: v_xor_b32_e32 v0, v1, v0 |
| ; GISEL-NEXT: ; return to shader part epilog |
| ; |
| ; O0-LABEL: bitop3_t1t2_and_xor_or: |
| ; O0: ; %bb.0: |
| ; O0-NEXT: v_mov_b32_e32 v3, v1 |
| ; O0-NEXT: v_mov_b32_e32 v1, v0 |
| ; O0-NEXT: v_bitop3_b32 v0, v1, v2, v3 bitop3:0xa0 |
| ; O0-NEXT: v_bitop3_b32 v1, v1, v2, v3 bitop3:0xfc |
| ; O0-NEXT: v_xor_b32_e64 v0, v0, v1 |
| ; O0-NEXT: ; return to shader part epilog |
| %t1 = and i32 %a, %b |
| %t2 = or i32 %a, %c |
| %and = and i32 %t1, %t2 |
| %or = or i32 %t1, %t2 |
| %r = xor i32 %and, %or |
| %ret = bitcast i32 %r to float |
| ret float %ret |
| } |
| |
| declare i32 @llvm.umax.i32(i32, i32) |
| declare i32 @llvm.fshl.i32(i32, i32, i32) |
| declare i32 @llvm.ctlz.i32(i32, i1 immarg) |