| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -new-reg-bank-select < %s | FileCheck %s --check-prefix=GFX10 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -new-reg-bank-select < %s | FileCheck %s --check-prefix=GFX12 |
| |
define void @fcmp_f16_uniform(half inreg %a, half inreg %b, ptr %p) {
; GFX10-LABEL: fcmp_f16_uniform:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_f16_e64 s4, s16, s17
; GFX10-NEXT: v_cmp_gt_f16_e64 s5, s16, s17
; GFX10-NEXT: v_cmp_ge_f16_e64 s6, s16, s17
; GFX10-NEXT: v_cmp_lt_f16_e64 s7, s16, s17
; GFX10-NEXT: v_cmp_le_f16_e64 s8, s16, s17
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: v_cmp_lg_f16_e64 s9, s16, s17
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: v_cmp_o_f16_e64 s10, s16, s17
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: v_cmp_nlg_f16_e64 s11, s16, s17
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: v_cmp_nle_f16_e64 s12, s16, s17
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: v_cmp_nlt_f16_e64 s13, s16, s17
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: v_cmp_nge_f16_e64 s14, s16, s17
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: v_cmp_ngt_f16_e64 s15, s16, s17
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_cmp_neq_f16_e64 s18, s16, s17
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: v_cmp_u_f16_e64 s16, s16, s17
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: s_cselect_b32 s14, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: s_cselect_b32 s14, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_add_i32 s4, s4, s5
; GFX10-NEXT: s_add_i32 s4, s4, s6
; GFX10-NEXT: s_add_i32 s4, s4, s7
; GFX10-NEXT: s_add_i32 s4, s4, s8
; GFX10-NEXT: s_add_i32 s4, s4, s9
; GFX10-NEXT: s_add_i32 s4, s4, s10
; GFX10-NEXT: s_add_i32 s4, s4, s11
; GFX10-NEXT: s_add_i32 s4, s4, s12
; GFX10-NEXT: s_add_i32 s4, s4, s13
; GFX10-NEXT: s_add_i32 s4, s4, s14
; GFX10-NEXT: s_add_i32 s4, s4, s15
; GFX10-NEXT: s_add_i32 s4, s4, s17
; GFX10-NEXT: s_add_i32 s4, s4, s16
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: fcmp_f16_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
; GFX12-NEXT: s_cmp_gt_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s3, 1, 0
; GFX12-NEXT: s_cmp_ge_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s4, 1, 0
; GFX12-NEXT: s_cmp_lt_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s5, 1, 0
; GFX12-NEXT: s_cmp_le_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s6, 1, 0
; GFX12-NEXT: s_cmp_lg_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s7, 1, 0
; GFX12-NEXT: s_cmp_o_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s8, 1, 0
; GFX12-NEXT: s_cmp_nlg_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s9, 1, 0
; GFX12-NEXT: s_cmp_nle_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s10, 1, 0
; GFX12-NEXT: s_cmp_nlt_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
; GFX12-NEXT: s_cmp_nge_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s12, 1, 0
; GFX12-NEXT: s_cmp_ngt_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s13, 1, 0
; GFX12-NEXT: s_cmp_neq_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s14, 1, 0
; GFX12-NEXT: s_cmp_u_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cselect_b32 s1, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s3, 0
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s4, 0
; GFX12-NEXT: s_cselect_b32 s3, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s5, 0
; GFX12-NEXT: s_cselect_b32 s4, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-NEXT: s_cselect_b32 s5, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s7, 0
; GFX12-NEXT: s_cselect_b32 s6, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s8, 0
; GFX12-NEXT: s_cselect_b32 s7, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s9, 0
; GFX12-NEXT: s_cselect_b32 s8, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s10, 0
; GFX12-NEXT: s_cselect_b32 s9, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s11, 0
; GFX12-NEXT: s_cselect_b32 s10, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s12, 0
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s13, 0
; GFX12-NEXT: s_cselect_b32 s12, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s14, 0
; GFX12-NEXT: s_cselect_b32 s13, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s2
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s3
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s4
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s5
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s6
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s7
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s8
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s9
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s10
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s11
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s12
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s13
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s0, s1, s0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
; Exercise all 14 IEEE fcmp predicates on one uniform (SGPR, inreg) f16
; pair. Per the checks above: GFX10 lowers uniform f16 compares to VOPC
; with an SGPR destination (v_cmp_*_f16_e64 sN) followed by
; s_cmp_lg_u32/s_cselect_b32, while GFX12 selects true scalar float
; compares (s_cmp_*_f16) that write SCC directly.
%oeq_result = fcmp oeq half %a, %b
%ogt_result = fcmp ogt half %a, %b
%oge_result = fcmp oge half %a, %b
%olt_result = fcmp olt half %a, %b
%ole_result = fcmp ole half %a, %b
%one_result = fcmp one half %a, %b
%ord_result = fcmp ord half %a, %b
%ueq_result = fcmp ueq half %a, %b
%ugt_result = fcmp ugt half %a, %b
%uge_result = fcmp uge half %a, %b
%ult_result = fcmp ult half %a, %b
%ule_result = fcmp ule half %a, %b
%une_result = fcmp une half %a, %b
%uno_result = fcmp uno half %a, %b
; Widen each i1 compare result to i32 so the results can be accumulated.
%oeq_zext = zext i1 %oeq_result to i32
%ogt_zext = zext i1 %ogt_result to i32
%oge_zext = zext i1 %oge_result to i32
%olt_zext = zext i1 %olt_result to i32
%ole_zext = zext i1 %ole_result to i32
%one_zext = zext i1 %one_result to i32
%ord_zext = zext i1 %ord_result to i32
%ueq_zext = zext i1 %ueq_result to i32
%ugt_zext = zext i1 %ugt_result to i32
%uge_zext = zext i1 %uge_result to i32
%ult_zext = zext i1 %ult_result to i32
%ule_zext = zext i1 %ule_result to i32
%une_zext = zext i1 %une_result to i32
%uno_zext = zext i1 %uno_result to i32
; Sum all 14 results so every compare stays live, then store the total once.
%sum1 = add i32 %oeq_zext, %ogt_zext
%sum2 = add i32 %sum1, %oge_zext
%sum3 = add i32 %sum2, %olt_zext
%sum4 = add i32 %sum3, %ole_zext
%sum5 = add i32 %sum4, %one_zext
%sum6 = add i32 %sum5, %ord_zext
%sum7 = add i32 %sum6, %ueq_zext
%sum8 = add i32 %sum7, %ugt_zext
%sum9 = add i32 %sum8, %uge_zext
%sum10 = add i32 %sum9, %ult_zext
%sum11 = add i32 %sum10, %ule_zext
%sum12 = add i32 %sum11, %une_zext
%result = add i32 %sum12, %uno_zext
store i32 %result, ptr %p
ret void
}
| |
define void @fcmp_f16_divergent(half %a, half %b, ptr %p) {
; GFX10-LABEL: fcmp_f16_divergent:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v5
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v1, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_add3_u32 v0, v1, v5, v0
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: fcmp_f16_divergent:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v5
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_add3_u32 v1, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v0, v1, v5, v0
; GFX12-NEXT: flat_store_b32 v[2:3], v0
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
; Exercise all 14 IEEE fcmp predicates on one divergent (VGPR) f16 pair.
; Per the checks above, both targets compare into vcc_lo, materialize the
; i1 with v_cndmask_b32, and fold the reduction into
; v_add_nc_u32/v_add3_u32; GFX12 additionally emits s_wait_alu/s_delay_alu
; hazard hints.
%oeq_result = fcmp oeq half %a, %b
%ogt_result = fcmp ogt half %a, %b
%oge_result = fcmp oge half %a, %b
%olt_result = fcmp olt half %a, %b
%ole_result = fcmp ole half %a, %b
%one_result = fcmp one half %a, %b
%ord_result = fcmp ord half %a, %b
%ueq_result = fcmp ueq half %a, %b
%ugt_result = fcmp ugt half %a, %b
%uge_result = fcmp uge half %a, %b
%ult_result = fcmp ult half %a, %b
%ule_result = fcmp ule half %a, %b
%une_result = fcmp une half %a, %b
%uno_result = fcmp uno half %a, %b
; Widen each i1 compare result to i32 so the results can be accumulated.
%oeq_zext = zext i1 %oeq_result to i32
%ogt_zext = zext i1 %ogt_result to i32
%oge_zext = zext i1 %oge_result to i32
%olt_zext = zext i1 %olt_result to i32
%ole_zext = zext i1 %ole_result to i32
%one_zext = zext i1 %one_result to i32
%ord_zext = zext i1 %ord_result to i32
%ueq_zext = zext i1 %ueq_result to i32
%ugt_zext = zext i1 %ugt_result to i32
%uge_zext = zext i1 %uge_result to i32
%ult_zext = zext i1 %ult_result to i32
%ule_zext = zext i1 %ule_result to i32
%une_zext = zext i1 %une_result to i32
%uno_zext = zext i1 %uno_result to i32
; Sum all 14 results so every compare stays live, then store the total once.
%sum1 = add i32 %oeq_zext, %ogt_zext
%sum2 = add i32 %sum1, %oge_zext
%sum3 = add i32 %sum2, %olt_zext
%sum4 = add i32 %sum3, %ole_zext
%sum5 = add i32 %sum4, %one_zext
%sum6 = add i32 %sum5, %ord_zext
%sum7 = add i32 %sum6, %ueq_zext
%sum8 = add i32 %sum7, %ugt_zext
%sum9 = add i32 %sum8, %uge_zext
%sum10 = add i32 %sum9, %ult_zext
%sum11 = add i32 %sum10, %ule_zext
%sum12 = add i32 %sum11, %une_zext
%result = add i32 %sum12, %uno_zext
store i32 %result, ptr %p
ret void
}
| |
define void @fcmp_f32_uniform(float inreg %a, float inreg %b, ptr %p) {
; GFX10-LABEL: fcmp_f32_uniform:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_f32_e64 s4, s16, s17
; GFX10-NEXT: v_cmp_gt_f32_e64 s5, s16, s17
; GFX10-NEXT: v_cmp_ge_f32_e64 s6, s16, s17
; GFX10-NEXT: v_cmp_lt_f32_e64 s7, s16, s17
; GFX10-NEXT: v_cmp_le_f32_e64 s8, s16, s17
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: v_cmp_lg_f32_e64 s9, s16, s17
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: v_cmp_o_f32_e64 s10, s16, s17
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: v_cmp_nlg_f32_e64 s11, s16, s17
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: v_cmp_nle_f32_e64 s12, s16, s17
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: v_cmp_nlt_f32_e64 s13, s16, s17
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: v_cmp_nge_f32_e64 s14, s16, s17
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: v_cmp_ngt_f32_e64 s15, s16, s17
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_cmp_neq_f32_e64 s18, s16, s17
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: v_cmp_u_f32_e64 s16, s16, s17
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: s_cselect_b32 s14, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: s_cselect_b32 s14, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_add_i32 s4, s4, s5
; GFX10-NEXT: s_add_i32 s4, s4, s6
; GFX10-NEXT: s_add_i32 s4, s4, s7
; GFX10-NEXT: s_add_i32 s4, s4, s8
; GFX10-NEXT: s_add_i32 s4, s4, s9
; GFX10-NEXT: s_add_i32 s4, s4, s10
; GFX10-NEXT: s_add_i32 s4, s4, s11
; GFX10-NEXT: s_add_i32 s4, s4, s12
; GFX10-NEXT: s_add_i32 s4, s4, s13
; GFX10-NEXT: s_add_i32 s4, s4, s14
; GFX10-NEXT: s_add_i32 s4, s4, s15
; GFX10-NEXT: s_add_i32 s4, s4, s17
; GFX10-NEXT: s_add_i32 s4, s4, s16
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: fcmp_f32_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
; GFX12-NEXT: s_cmp_gt_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s3, 1, 0
; GFX12-NEXT: s_cmp_ge_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s4, 1, 0
; GFX12-NEXT: s_cmp_lt_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s5, 1, 0
; GFX12-NEXT: s_cmp_le_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s6, 1, 0
; GFX12-NEXT: s_cmp_lg_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s7, 1, 0
; GFX12-NEXT: s_cmp_o_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s8, 1, 0
; GFX12-NEXT: s_cmp_nlg_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s9, 1, 0
; GFX12-NEXT: s_cmp_nle_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s10, 1, 0
; GFX12-NEXT: s_cmp_nlt_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
; GFX12-NEXT: s_cmp_nge_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s12, 1, 0
; GFX12-NEXT: s_cmp_ngt_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s13, 1, 0
; GFX12-NEXT: s_cmp_neq_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s14, 1, 0
; GFX12-NEXT: s_cmp_u_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cselect_b32 s1, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s3, 0
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s4, 0
; GFX12-NEXT: s_cselect_b32 s3, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s5, 0
; GFX12-NEXT: s_cselect_b32 s4, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-NEXT: s_cselect_b32 s5, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s7, 0
; GFX12-NEXT: s_cselect_b32 s6, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s8, 0
; GFX12-NEXT: s_cselect_b32 s7, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s9, 0
; GFX12-NEXT: s_cselect_b32 s8, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s10, 0
; GFX12-NEXT: s_cselect_b32 s9, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s11, 0
; GFX12-NEXT: s_cselect_b32 s10, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s12, 0
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s13, 0
; GFX12-NEXT: s_cselect_b32 s12, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s14, 0
; GFX12-NEXT: s_cselect_b32 s13, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s2
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s3
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s4
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s5
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s6
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s7
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s8
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s9
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s10
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s11
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s12
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s13
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s0, s1, s0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
; Exercise all 14 IEEE fcmp predicates on one uniform (SGPR, inreg) f32
; pair. Per the checks above: GFX10 lowers uniform f32 compares to VOPC
; with an SGPR destination (v_cmp_*_f32_e64 sN) followed by
; s_cmp_lg_u32/s_cselect_b32, while GFX12 selects true scalar float
; compares (s_cmp_*_f32) that write SCC directly.
%oeq_result = fcmp oeq float %a, %b
%ogt_result = fcmp ogt float %a, %b
%oge_result = fcmp oge float %a, %b
%olt_result = fcmp olt float %a, %b
%ole_result = fcmp ole float %a, %b
%one_result = fcmp one float %a, %b
%ord_result = fcmp ord float %a, %b
%ueq_result = fcmp ueq float %a, %b
%ugt_result = fcmp ugt float %a, %b
%uge_result = fcmp uge float %a, %b
%ult_result = fcmp ult float %a, %b
%ule_result = fcmp ule float %a, %b
%une_result = fcmp une float %a, %b
%uno_result = fcmp uno float %a, %b
; Widen each i1 compare result to i32 so the results can be accumulated.
%oeq_zext = zext i1 %oeq_result to i32
%ogt_zext = zext i1 %ogt_result to i32
%oge_zext = zext i1 %oge_result to i32
%olt_zext = zext i1 %olt_result to i32
%ole_zext = zext i1 %ole_result to i32
%one_zext = zext i1 %one_result to i32
%ord_zext = zext i1 %ord_result to i32
%ueq_zext = zext i1 %ueq_result to i32
%ugt_zext = zext i1 %ugt_result to i32
%uge_zext = zext i1 %uge_result to i32
%ult_zext = zext i1 %ult_result to i32
%ule_zext = zext i1 %ule_result to i32
%une_zext = zext i1 %une_result to i32
%uno_zext = zext i1 %uno_result to i32
; Sum all 14 results so every compare stays live, then store the total once.
%sum1 = add i32 %oeq_zext, %ogt_zext
%sum2 = add i32 %sum1, %oge_zext
%sum3 = add i32 %sum2, %olt_zext
%sum4 = add i32 %sum3, %ole_zext
%sum5 = add i32 %sum4, %one_zext
%sum6 = add i32 %sum5, %ord_zext
%sum7 = add i32 %sum6, %ueq_zext
%sum8 = add i32 %sum7, %ugt_zext
%sum9 = add i32 %sum8, %uge_zext
%sum10 = add i32 %sum9, %ult_zext
%sum11 = add i32 %sum10, %ule_zext
%sum12 = add i32 %sum11, %une_zext
%result = add i32 %sum12, %uno_zext
store i32 %result, ptr %p
ret void
}
| |
define void @fcmp_f32_divergent(float %a, float %b, ptr %p) {
; GFX10-LABEL: fcmp_f32_divergent:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v5
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v1, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_add3_u32 v0, v1, v5, v0
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: fcmp_f32_divergent:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v5
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_add3_u32 v1, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v0, v1, v5, v0
; GFX12-NEXT: flat_store_b32 v[2:3], v0
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
; Exercise all 14 IEEE fcmp predicates on one divergent (VGPR) f32 pair.
; Per the checks above, both targets compare into vcc_lo, materialize the
; i1 with v_cndmask_b32, and fold the reduction into
; v_add_nc_u32/v_add3_u32; GFX12 additionally emits s_wait_alu/s_delay_alu
; hazard hints.
%oeq_result = fcmp oeq float %a, %b
%ogt_result = fcmp ogt float %a, %b
%oge_result = fcmp oge float %a, %b
%olt_result = fcmp olt float %a, %b
%ole_result = fcmp ole float %a, %b
%one_result = fcmp one float %a, %b
%ord_result = fcmp ord float %a, %b
%ueq_result = fcmp ueq float %a, %b
%ugt_result = fcmp ugt float %a, %b
%uge_result = fcmp uge float %a, %b
%ult_result = fcmp ult float %a, %b
%ule_result = fcmp ule float %a, %b
%une_result = fcmp une float %a, %b
%uno_result = fcmp uno float %a, %b
; Widen each i1 compare result to i32 so the results can be accumulated.
%oeq_zext = zext i1 %oeq_result to i32
%ogt_zext = zext i1 %ogt_result to i32
%oge_zext = zext i1 %oge_result to i32
%olt_zext = zext i1 %olt_result to i32
%ole_zext = zext i1 %ole_result to i32
%one_zext = zext i1 %one_result to i32
%ord_zext = zext i1 %ord_result to i32
%ueq_zext = zext i1 %ueq_result to i32
%ugt_zext = zext i1 %ugt_result to i32
%uge_zext = zext i1 %uge_result to i32
%ult_zext = zext i1 %ult_result to i32
%ule_zext = zext i1 %ule_result to i32
%une_zext = zext i1 %une_result to i32
%uno_zext = zext i1 %uno_result to i32
; Sum all 14 results so every compare stays live, then store the total once.
%sum1 = add i32 %oeq_zext, %ogt_zext
%sum2 = add i32 %sum1, %oge_zext
%sum3 = add i32 %sum2, %olt_zext
%sum4 = add i32 %sum3, %ole_zext
%sum5 = add i32 %sum4, %one_zext
%sum6 = add i32 %sum5, %ord_zext
%sum7 = add i32 %sum6, %ueq_zext
%sum8 = add i32 %sum7, %ugt_zext
%sum9 = add i32 %sum8, %uge_zext
%sum10 = add i32 %sum9, %ult_zext
%sum11 = add i32 %sum10, %ule_zext
%sum12 = add i32 %sum11, %une_zext
%result = add i32 %sum12, %uno_zext
store i32 %result, ptr %p
ret void
}
| |
| ; Uniform f64 operands (both passed 'inreg', i.e. in SGPRs): each of the 14 |
| ; IEEE fcmp predicates is evaluated with a VALU v_cmp_*_f64_e64 writing an |
| ; SGPR mask, then converted to 0/1 via s_cmp_lg_u32 + s_cselect_b32 and |
| ; accumulated with scalar adds before a single store through %p. |
| ; NOTE(review): each predicate goes through the s_cmp/s_cselect pair twice |
| ; in both GFX10 and GFX12 output (e.g. s4 is re-tested after already being |
| ; selected to 0/1) — presumably a zext-of-i1 lowering artifact; confirm |
| ; whether the redundant second round-trip is intended. |
| define void @fcmp_f64_uniform(double inreg %a, double inreg %b, ptr %p) { |
| ; GFX10-LABEL: fcmp_f64_uniform: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_cmp_eq_f64_e64 s4, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s5, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_ge_f64_e64 s6, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_lt_f64_e64 s7, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_le_f64_e64 s8, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_lg_f64_e64 s9, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_o_f64_e64 s10, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_nlg_f64_e64 s11, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_nle_f64_e64 s12, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_nlt_f64_e64 s13, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_nge_f64_e64 s14, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_ngt_f64_e64 s15, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_neq_f64_e64 s20, s[16:17], s[18:19] |
| ; GFX10-NEXT: v_cmp_u_f64_e64 s16, s[16:17], s[18:19] |
| ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 |
| ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 |
| ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 |
| ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s7, 0 |
| ; GFX10-NEXT: s_cselect_b32 s7, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 |
| ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 |
| ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s10, 0 |
| ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 |
| ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 |
| ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 |
| ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s14, 0 |
| ; GFX10-NEXT: s_cselect_b32 s14, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 |
| ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s20, 0 |
| ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 |
| ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 |
| ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 |
| ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 |
| ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s7, 0 |
| ; GFX10-NEXT: s_cselect_b32 s7, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 |
| ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 |
| ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s10, 0 |
| ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 |
| ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 |
| ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 |
| ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s14, 0 |
| ; GFX10-NEXT: s_cselect_b32 s14, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 |
| ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 |
| ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 |
| ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 |
| ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s5 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s6 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s7 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s8 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s9 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s10 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s11 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s12 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s13 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s14 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s15 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s17 |
| ; GFX10-NEXT: s_add_i32 s4, s4, s16 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX10-NEXT: flat_store_dword v[0:1], v2 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-LABEL: fcmp_f64_uniform: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_cmp_eq_f64_e64 s4, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_gt_f64_e64 s5, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_ge_f64_e64 s6, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_lt_f64_e64 s7, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_le_f64_e64 s8, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_lg_f64_e64 s9, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_o_f64_e64 s10, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_nlg_f64_e64 s11, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_nle_f64_e64 s12, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_nlt_f64_e64 s13, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_nge_f64_e64 s14, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_ngt_f64_e64 s15, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_neq_f64_e64 s16, s[0:1], s[2:3] |
| ; GFX12-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[2:3] |
| ; GFX12-NEXT: s_cmp_lg_u32 s4, 0 |
| ; GFX12-NEXT: s_cselect_b32 s4, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s5, 0 |
| ; GFX12-NEXT: s_cselect_b32 s1, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 |
| ; GFX12-NEXT: s_cselect_b32 s2, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s7, 0 |
| ; GFX12-NEXT: s_cselect_b32 s3, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s8, 0 |
| ; GFX12-NEXT: s_cselect_b32 s5, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s9, 0 |
| ; GFX12-NEXT: s_cselect_b32 s6, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s10, 0 |
| ; GFX12-NEXT: s_cselect_b32 s7, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s11, 0 |
| ; GFX12-NEXT: s_cselect_b32 s8, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s12, 0 |
| ; GFX12-NEXT: s_cselect_b32 s9, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s13, 0 |
| ; GFX12-NEXT: s_cselect_b32 s10, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s14, 0 |
| ; GFX12-NEXT: s_cselect_b32 s11, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s15, 0 |
| ; GFX12-NEXT: s_cselect_b32 s12, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s16, 0 |
| ; GFX12-NEXT: s_cselect_b32 s13, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s0, 0 |
| ; GFX12-NEXT: s_cselect_b32 s0, 1, 0 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_cmp_lg_u32 s4, 0 |
| ; GFX12-NEXT: s_cselect_b32 s4, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s1, 0 |
| ; GFX12-NEXT: s_cselect_b32 s1, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX12-NEXT: s_cselect_b32 s2, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s3, 0 |
| ; GFX12-NEXT: s_cselect_b32 s3, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s5, 0 |
| ; GFX12-NEXT: s_cselect_b32 s5, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 |
| ; GFX12-NEXT: s_cselect_b32 s6, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s7, 0 |
| ; GFX12-NEXT: s_cselect_b32 s7, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s8, 0 |
| ; GFX12-NEXT: s_cselect_b32 s8, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s9, 0 |
| ; GFX12-NEXT: s_cselect_b32 s9, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s10, 0 |
| ; GFX12-NEXT: s_cselect_b32 s10, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s11, 0 |
| ; GFX12-NEXT: s_cselect_b32 s11, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s12, 0 |
| ; GFX12-NEXT: s_cselect_b32 s12, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s13, 0 |
| ; GFX12-NEXT: s_cselect_b32 s13, 1, 0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s0, 0 |
| ; GFX12-NEXT: s_cselect_b32 s0, 1, 0 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s4, s1 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s2 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s3 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s5 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s6 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s7 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s8 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s9 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s10 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s11 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s12 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, s13 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: s_add_co_i32 s0, s1, s0 |
| ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX12-NEXT: flat_store_b32 v[0:1], v2 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: s_setpc_b64 s[30:31] |
| ; One fcmp per IEEE predicate; each i1 result is zext'd to i32 and the |
| ; 14 values are summed into a single i32 stored through %p. |
| %oeq_result = fcmp oeq double %a, %b |
| %ogt_result = fcmp ogt double %a, %b |
| %oge_result = fcmp oge double %a, %b |
| %olt_result = fcmp olt double %a, %b |
| %ole_result = fcmp ole double %a, %b |
| %one_result = fcmp one double %a, %b |
| %ord_result = fcmp ord double %a, %b |
| %ueq_result = fcmp ueq double %a, %b |
| %ugt_result = fcmp ugt double %a, %b |
| %uge_result = fcmp uge double %a, %b |
| %ult_result = fcmp ult double %a, %b |
| %ule_result = fcmp ule double %a, %b |
| %une_result = fcmp une double %a, %b |
| %uno_result = fcmp uno double %a, %b |
| %oeq_zext = zext i1 %oeq_result to i32 |
| %ogt_zext = zext i1 %ogt_result to i32 |
| %oge_zext = zext i1 %oge_result to i32 |
| %olt_zext = zext i1 %olt_result to i32 |
| %ole_zext = zext i1 %ole_result to i32 |
| %one_zext = zext i1 %one_result to i32 |
| %ord_zext = zext i1 %ord_result to i32 |
| %ueq_zext = zext i1 %ueq_result to i32 |
| %ugt_zext = zext i1 %ugt_result to i32 |
| %uge_zext = zext i1 %uge_result to i32 |
| %ult_zext = zext i1 %ult_result to i32 |
| %ule_zext = zext i1 %ule_result to i32 |
| %une_zext = zext i1 %une_result to i32 |
| %uno_zext = zext i1 %uno_result to i32 |
| %sum1 = add i32 %oeq_zext, %ogt_zext |
| %sum2 = add i32 %sum1, %oge_zext |
| %sum3 = add i32 %sum2, %olt_zext |
| %sum4 = add i32 %sum3, %ole_zext |
| %sum5 = add i32 %sum4, %one_zext |
| %sum6 = add i32 %sum5, %ord_zext |
| %sum7 = add i32 %sum6, %ueq_zext |
| %sum8 = add i32 %sum7, %ugt_zext |
| %sum9 = add i32 %sum8, %uge_zext |
| %sum10 = add i32 %sum9, %ult_zext |
| %sum11 = add i32 %sum10, %ule_zext |
| %sum12 = add i32 %sum11, %une_zext |
| %result = add i32 %sum12, %uno_zext |
| store i32 %result, ptr %p |
| ret void |
| } |
| |
| ; Divergent f64 operands (plain VGPR arguments): each of the 14 fcmp |
| ; predicates uses v_cmp_*_f64_e32 into vcc_lo, the mask is materialized to |
| ; 0/1 with v_cndmask_b32_e64, and the results are accumulated with |
| ; v_add_nc_u32 / v_add3_u32. GFX12 additionally emits s_wait_alu |
| ; depctr_va_vcc(0) before each vcc_lo consumer and s_delay_alu hints. |
| define void @fcmp_f64_divergent(double %a, double %b, ptr %p) { |
| ; GFX10-LABEL: fcmp_f64_divergent: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_ge_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v7 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_add3_u32 v6, v6, v8, v9 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_lg_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_o_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_add3_u32 v6, v6, v7, v10 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_add3_u32 v6, v6, v8, v9 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_nlt_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_add3_u32 v6, v6, v7, v10 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_ngt_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_neq_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX10-NEXT: v_add3_u32 v1, v6, v8, v9 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo |
| ; GFX10-NEXT: v_add3_u32 v0, v1, v7, v0 |
| ; GFX10-NEXT: flat_store_dword v[4:5], v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX12-LABEL: fcmp_f64_divergent: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: s_wait_expcnt 0x0 |
| ; GFX12-NEXT: s_wait_samplecnt 0x0 |
| ; GFX12-NEXT: s_wait_bvhcnt 0x0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_ge_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_add_nc_u32_e32 v6, v6, v7 |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_add3_u32 v6, v6, v8, v9 |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_lg_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_o_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_add3_u32 v6, v6, v7, v10 |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_add3_u32 v6, v6, v8, v9 |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_nlt_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_add3_u32 v6, v6, v7, v10 |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_ngt_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_neq_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo |
| ; GFX12-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3] |
| ; GFX12-NEXT: v_add3_u32 v1, v6, v8, v9 |
| ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) |
| ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_add3_u32 v0, v1, v7, v0 |
| ; GFX12-NEXT: flat_store_b32 v[4:5], v0 |
| ; GFX12-NEXT: s_wait_dscnt 0x0 |
| ; GFX12-NEXT: s_setpc_b64 s[30:31] |
| ; One fcmp per IEEE predicate; each i1 result is zext'd to i32 and the |
| ; 14 values are summed into a single i32 stored through %p. |
| %oeq_result = fcmp oeq double %a, %b |
| %ogt_result = fcmp ogt double %a, %b |
| %oge_result = fcmp oge double %a, %b |
| %olt_result = fcmp olt double %a, %b |
| %ole_result = fcmp ole double %a, %b |
| %one_result = fcmp one double %a, %b |
| %ord_result = fcmp ord double %a, %b |
| %ueq_result = fcmp ueq double %a, %b |
| %ugt_result = fcmp ugt double %a, %b |
| %uge_result = fcmp uge double %a, %b |
| %ult_result = fcmp ult double %a, %b |
| %ule_result = fcmp ule double %a, %b |
| %une_result = fcmp une double %a, %b |
| %uno_result = fcmp uno double %a, %b |
| %oeq_zext = zext i1 %oeq_result to i32 |
| %ogt_zext = zext i1 %ogt_result to i32 |
| %oge_zext = zext i1 %oge_result to i32 |
| %olt_zext = zext i1 %olt_result to i32 |
| %ole_zext = zext i1 %ole_result to i32 |
| %one_zext = zext i1 %one_result to i32 |
| %ord_zext = zext i1 %ord_result to i32 |
| %ueq_zext = zext i1 %ueq_result to i32 |
| %ugt_zext = zext i1 %ugt_result to i32 |
| %uge_zext = zext i1 %uge_result to i32 |
| %ult_zext = zext i1 %ult_result to i32 |
| %ule_zext = zext i1 %ule_result to i32 |
| %une_zext = zext i1 %une_result to i32 |
| %uno_zext = zext i1 %uno_result to i32 |
| %sum1 = add i32 %oeq_zext, %ogt_zext |
| %sum2 = add i32 %sum1, %oge_zext |
| %sum3 = add i32 %sum2, %olt_zext |
| %sum4 = add i32 %sum3, %ole_zext |
| %sum5 = add i32 %sum4, %one_zext |
| %sum6 = add i32 %sum5, %ord_zext |
| %sum7 = add i32 %sum6, %ueq_zext |
| %sum8 = add i32 %sum7, %ugt_zext |
| %sum9 = add i32 %sum8, %uge_zext |
| %sum10 = add i32 %sum9, %ult_zext |
| %sum11 = add i32 %sum10, %ule_zext |
| %sum12 = add i32 %sum11, %une_zext |
| %result = add i32 %sum12, %uno_zext |
| store i32 %result, ptr %p |
| ret void |
| } |