blob: 496c6597e9afd4f57a81939403aed164d8e5f6a8 [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -new-reg-bank-select < %s | FileCheck %s --check-prefix=GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -new-reg-bank-select < %s | FileCheck %s --check-prefix=GFX12
; Exercises all 14 IEEE fcmp predicates (oeq..uno) on uniform (inreg/SGPR) half
; operands. Each i1 result is zero-extended to i32, all 14 are summed, and the
; sum is stored through %p so no compare can be dead-code-eliminated.
; GFX10 lowers each compare to v_cmp_*_f16_e64 writing an SGPR; GFX12 uses the
; scalar s_cmp_*_f16 instructions directly.
; NOTE: the CHECK lines below are autogenerated by update_llc_test_checks.py —
; do not edit them by hand; regenerate instead.
define void @fcmp_f16_uniform(half inreg %a, half inreg %b, ptr %p) {
; GFX10-LABEL: fcmp_f16_uniform:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_f16_e64 s4, s16, s17
; GFX10-NEXT: v_cmp_gt_f16_e64 s5, s16, s17
; GFX10-NEXT: v_cmp_ge_f16_e64 s6, s16, s17
; GFX10-NEXT: v_cmp_lt_f16_e64 s7, s16, s17
; GFX10-NEXT: v_cmp_le_f16_e64 s8, s16, s17
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: v_cmp_lg_f16_e64 s9, s16, s17
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: v_cmp_o_f16_e64 s10, s16, s17
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: v_cmp_nlg_f16_e64 s11, s16, s17
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: v_cmp_nle_f16_e64 s12, s16, s17
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: v_cmp_nlt_f16_e64 s13, s16, s17
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: v_cmp_nge_f16_e64 s14, s16, s17
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: v_cmp_ngt_f16_e64 s15, s16, s17
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_cmp_neq_f16_e64 s18, s16, s17
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: v_cmp_u_f16_e64 s16, s16, s17
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: s_cselect_b32 s14, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: s_cselect_b32 s14, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_add_i32 s4, s4, s5
; GFX10-NEXT: s_add_i32 s4, s4, s6
; GFX10-NEXT: s_add_i32 s4, s4, s7
; GFX10-NEXT: s_add_i32 s4, s4, s8
; GFX10-NEXT: s_add_i32 s4, s4, s9
; GFX10-NEXT: s_add_i32 s4, s4, s10
; GFX10-NEXT: s_add_i32 s4, s4, s11
; GFX10-NEXT: s_add_i32 s4, s4, s12
; GFX10-NEXT: s_add_i32 s4, s4, s13
; GFX10-NEXT: s_add_i32 s4, s4, s14
; GFX10-NEXT: s_add_i32 s4, s4, s15
; GFX10-NEXT: s_add_i32 s4, s4, s17
; GFX10-NEXT: s_add_i32 s4, s4, s16
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: fcmp_f16_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
; GFX12-NEXT: s_cmp_gt_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s3, 1, 0
; GFX12-NEXT: s_cmp_ge_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s4, 1, 0
; GFX12-NEXT: s_cmp_lt_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s5, 1, 0
; GFX12-NEXT: s_cmp_le_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s6, 1, 0
; GFX12-NEXT: s_cmp_lg_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s7, 1, 0
; GFX12-NEXT: s_cmp_o_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s8, 1, 0
; GFX12-NEXT: s_cmp_nlg_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s9, 1, 0
; GFX12-NEXT: s_cmp_nle_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s10, 1, 0
; GFX12-NEXT: s_cmp_nlt_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
; GFX12-NEXT: s_cmp_nge_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s12, 1, 0
; GFX12-NEXT: s_cmp_ngt_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s13, 1, 0
; GFX12-NEXT: s_cmp_neq_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s14, 1, 0
; GFX12-NEXT: s_cmp_u_f16 s0, s1
; GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cselect_b32 s1, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s3, 0
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s4, 0
; GFX12-NEXT: s_cselect_b32 s3, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s5, 0
; GFX12-NEXT: s_cselect_b32 s4, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-NEXT: s_cselect_b32 s5, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s7, 0
; GFX12-NEXT: s_cselect_b32 s6, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s8, 0
; GFX12-NEXT: s_cselect_b32 s7, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s9, 0
; GFX12-NEXT: s_cselect_b32 s8, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s10, 0
; GFX12-NEXT: s_cselect_b32 s9, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s11, 0
; GFX12-NEXT: s_cselect_b32 s10, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s12, 0
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s13, 0
; GFX12-NEXT: s_cselect_b32 s12, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s14, 0
; GFX12-NEXT: s_cselect_b32 s13, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s2
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s3
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s4
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s5
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s6
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s7
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s8
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s9
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s10
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s11
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s12
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s13
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s0, s1, s0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
  ; One compare per fcmp predicate; ordered predicates first, then ord,
  ; then the unordered family, then uno.
  %oeq_result = fcmp oeq half %a, %b
  %ogt_result = fcmp ogt half %a, %b
  %oge_result = fcmp oge half %a, %b
  %olt_result = fcmp olt half %a, %b
  %ole_result = fcmp ole half %a, %b
  %one_result = fcmp one half %a, %b
  %ord_result = fcmp ord half %a, %b
  %ueq_result = fcmp ueq half %a, %b
  %ugt_result = fcmp ugt half %a, %b
  %uge_result = fcmp uge half %a, %b
  %ult_result = fcmp ult half %a, %b
  %ule_result = fcmp ule half %a, %b
  %une_result = fcmp une half %a, %b
  %uno_result = fcmp uno half %a, %b
  ; Widen each i1 to i32 so the results can be accumulated.
  %oeq_zext = zext i1 %oeq_result to i32
  %ogt_zext = zext i1 %ogt_result to i32
  %oge_zext = zext i1 %oge_result to i32
  %olt_zext = zext i1 %olt_result to i32
  %ole_zext = zext i1 %ole_result to i32
  %one_zext = zext i1 %one_result to i32
  %ord_zext = zext i1 %ord_result to i32
  %ueq_zext = zext i1 %ueq_result to i32
  %ugt_zext = zext i1 %ugt_result to i32
  %uge_zext = zext i1 %uge_result to i32
  %ult_zext = zext i1 %ult_result to i32
  %ule_zext = zext i1 %ule_result to i32
  %une_zext = zext i1 %une_result to i32
  %uno_zext = zext i1 %uno_result to i32
  ; Sum all 14 results and store, keeping every compare live.
  %sum1 = add i32 %oeq_zext, %ogt_zext
  %sum2 = add i32 %sum1, %oge_zext
  %sum3 = add i32 %sum2, %olt_zext
  %sum4 = add i32 %sum3, %ole_zext
  %sum5 = add i32 %sum4, %one_zext
  %sum6 = add i32 %sum5, %ord_zext
  %sum7 = add i32 %sum6, %ueq_zext
  %sum8 = add i32 %sum7, %ugt_zext
  %sum9 = add i32 %sum8, %uge_zext
  %sum10 = add i32 %sum9, %ult_zext
  %sum11 = add i32 %sum10, %ule_zext
  %sum12 = add i32 %sum11, %une_zext
  %result = add i32 %sum12, %uno_zext
  store i32 %result, ptr %p
  ret void
}
; Same 14-predicate fcmp coverage as fcmp_f16_uniform, but with divergent
; (VGPR) half operands: both targets lower to VALU v_cmp_*_f16 into vcc_lo
; followed by v_cndmask to materialize 0/1, with the sums folded into
; v_add3_u32 triples.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — regenerate
; rather than hand-editing.
define void @fcmp_f16_divergent(half %a, half %b, ptr %p) {
; GFX10-LABEL: fcmp_f16_divergent:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v5
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v1, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_add3_u32 v0, v1, v5, v0
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: fcmp_f16_divergent:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v5
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_add3_u32 v1, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v0, v1, v5, v0
; GFX12-NEXT: flat_store_b32 v[2:3], v0
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
  ; One compare per fcmp predicate on divergent operands.
  %oeq_result = fcmp oeq half %a, %b
  %ogt_result = fcmp ogt half %a, %b
  %oge_result = fcmp oge half %a, %b
  %olt_result = fcmp olt half %a, %b
  %ole_result = fcmp ole half %a, %b
  %one_result = fcmp one half %a, %b
  %ord_result = fcmp ord half %a, %b
  %ueq_result = fcmp ueq half %a, %b
  %ugt_result = fcmp ugt half %a, %b
  %uge_result = fcmp uge half %a, %b
  %ult_result = fcmp ult half %a, %b
  %ule_result = fcmp ule half %a, %b
  %une_result = fcmp une half %a, %b
  %uno_result = fcmp uno half %a, %b
  ; Widen each i1 to i32 so the results can be accumulated.
  %oeq_zext = zext i1 %oeq_result to i32
  %ogt_zext = zext i1 %ogt_result to i32
  %oge_zext = zext i1 %oge_result to i32
  %olt_zext = zext i1 %olt_result to i32
  %ole_zext = zext i1 %ole_result to i32
  %one_zext = zext i1 %one_result to i32
  %ord_zext = zext i1 %ord_result to i32
  %ueq_zext = zext i1 %ueq_result to i32
  %ugt_zext = zext i1 %ugt_result to i32
  %uge_zext = zext i1 %uge_result to i32
  %ult_zext = zext i1 %ult_result to i32
  %ule_zext = zext i1 %ule_result to i32
  %une_zext = zext i1 %une_result to i32
  %uno_zext = zext i1 %uno_result to i32
  ; Sum all 14 results and store, keeping every compare live.
  %sum1 = add i32 %oeq_zext, %ogt_zext
  %sum2 = add i32 %sum1, %oge_zext
  %sum3 = add i32 %sum2, %olt_zext
  %sum4 = add i32 %sum3, %ole_zext
  %sum5 = add i32 %sum4, %one_zext
  %sum6 = add i32 %sum5, %ord_zext
  %sum7 = add i32 %sum6, %ueq_zext
  %sum8 = add i32 %sum7, %ugt_zext
  %sum9 = add i32 %sum8, %uge_zext
  %sum10 = add i32 %sum9, %ult_zext
  %sum11 = add i32 %sum10, %ule_zext
  %sum12 = add i32 %sum11, %une_zext
  %result = add i32 %sum12, %uno_zext
  store i32 %result, ptr %p
  ret void
}
; f32 variant of fcmp_f16_uniform: all 14 fcmp predicates on uniform
; (inreg/SGPR) float operands, results zero-extended, summed, and stored.
; GFX10 uses v_cmp_*_f32_e64 into SGPRs; GFX12 uses scalar s_cmp_*_f32.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — regenerate
; rather than hand-editing.
define void @fcmp_f32_uniform(float inreg %a, float inreg %b, ptr %p) {
; GFX10-LABEL: fcmp_f32_uniform:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_f32_e64 s4, s16, s17
; GFX10-NEXT: v_cmp_gt_f32_e64 s5, s16, s17
; GFX10-NEXT: v_cmp_ge_f32_e64 s6, s16, s17
; GFX10-NEXT: v_cmp_lt_f32_e64 s7, s16, s17
; GFX10-NEXT: v_cmp_le_f32_e64 s8, s16, s17
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: v_cmp_lg_f32_e64 s9, s16, s17
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: v_cmp_o_f32_e64 s10, s16, s17
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: v_cmp_nlg_f32_e64 s11, s16, s17
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: v_cmp_nle_f32_e64 s12, s16, s17
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: v_cmp_nlt_f32_e64 s13, s16, s17
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: v_cmp_nge_f32_e64 s14, s16, s17
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: v_cmp_ngt_f32_e64 s15, s16, s17
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_cmp_neq_f32_e64 s18, s16, s17
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: v_cmp_u_f32_e64 s16, s16, s17
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: s_cselect_b32 s14, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: s_cselect_b32 s14, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_add_i32 s4, s4, s5
; GFX10-NEXT: s_add_i32 s4, s4, s6
; GFX10-NEXT: s_add_i32 s4, s4, s7
; GFX10-NEXT: s_add_i32 s4, s4, s8
; GFX10-NEXT: s_add_i32 s4, s4, s9
; GFX10-NEXT: s_add_i32 s4, s4, s10
; GFX10-NEXT: s_add_i32 s4, s4, s11
; GFX10-NEXT: s_add_i32 s4, s4, s12
; GFX10-NEXT: s_add_i32 s4, s4, s13
; GFX10-NEXT: s_add_i32 s4, s4, s14
; GFX10-NEXT: s_add_i32 s4, s4, s15
; GFX10-NEXT: s_add_i32 s4, s4, s17
; GFX10-NEXT: s_add_i32 s4, s4, s16
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: fcmp_f32_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
; GFX12-NEXT: s_cmp_gt_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s3, 1, 0
; GFX12-NEXT: s_cmp_ge_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s4, 1, 0
; GFX12-NEXT: s_cmp_lt_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s5, 1, 0
; GFX12-NEXT: s_cmp_le_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s6, 1, 0
; GFX12-NEXT: s_cmp_lg_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s7, 1, 0
; GFX12-NEXT: s_cmp_o_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s8, 1, 0
; GFX12-NEXT: s_cmp_nlg_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s9, 1, 0
; GFX12-NEXT: s_cmp_nle_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s10, 1, 0
; GFX12-NEXT: s_cmp_nlt_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
; GFX12-NEXT: s_cmp_nge_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s12, 1, 0
; GFX12-NEXT: s_cmp_ngt_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s13, 1, 0
; GFX12-NEXT: s_cmp_neq_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s14, 1, 0
; GFX12-NEXT: s_cmp_u_f32 s0, s1
; GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cselect_b32 s1, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s3, 0
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s4, 0
; GFX12-NEXT: s_cselect_b32 s3, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s5, 0
; GFX12-NEXT: s_cselect_b32 s4, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-NEXT: s_cselect_b32 s5, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s7, 0
; GFX12-NEXT: s_cselect_b32 s6, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s8, 0
; GFX12-NEXT: s_cselect_b32 s7, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s9, 0
; GFX12-NEXT: s_cselect_b32 s8, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s10, 0
; GFX12-NEXT: s_cselect_b32 s9, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s11, 0
; GFX12-NEXT: s_cselect_b32 s10, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s12, 0
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s13, 0
; GFX12-NEXT: s_cselect_b32 s12, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s14, 0
; GFX12-NEXT: s_cselect_b32 s13, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s2
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s3
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s4
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s5
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s6
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s7
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s8
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s9
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s10
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s11
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s12
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s13
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s0, s1, s0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
  ; One compare per fcmp predicate on uniform float operands.
  %oeq_result = fcmp oeq float %a, %b
  %ogt_result = fcmp ogt float %a, %b
  %oge_result = fcmp oge float %a, %b
  %olt_result = fcmp olt float %a, %b
  %ole_result = fcmp ole float %a, %b
  %one_result = fcmp one float %a, %b
  %ord_result = fcmp ord float %a, %b
  %ueq_result = fcmp ueq float %a, %b
  %ugt_result = fcmp ugt float %a, %b
  %uge_result = fcmp uge float %a, %b
  %ult_result = fcmp ult float %a, %b
  %ule_result = fcmp ule float %a, %b
  %une_result = fcmp une float %a, %b
  %uno_result = fcmp uno float %a, %b
  ; Widen each i1 to i32 so the results can be accumulated.
  %oeq_zext = zext i1 %oeq_result to i32
  %ogt_zext = zext i1 %ogt_result to i32
  %oge_zext = zext i1 %oge_result to i32
  %olt_zext = zext i1 %olt_result to i32
  %ole_zext = zext i1 %ole_result to i32
  %one_zext = zext i1 %one_result to i32
  %ord_zext = zext i1 %ord_result to i32
  %ueq_zext = zext i1 %ueq_result to i32
  %ugt_zext = zext i1 %ugt_result to i32
  %uge_zext = zext i1 %uge_result to i32
  %ult_zext = zext i1 %ult_result to i32
  %ule_zext = zext i1 %ule_result to i32
  %une_zext = zext i1 %une_result to i32
  %uno_zext = zext i1 %uno_result to i32
  ; Sum all 14 results and store, keeping every compare live.
  %sum1 = add i32 %oeq_zext, %ogt_zext
  %sum2 = add i32 %sum1, %oge_zext
  %sum3 = add i32 %sum2, %olt_zext
  %sum4 = add i32 %sum3, %ole_zext
  %sum5 = add i32 %sum4, %one_zext
  %sum6 = add i32 %sum5, %ord_zext
  %sum7 = add i32 %sum6, %ueq_zext
  %sum8 = add i32 %sum7, %ugt_zext
  %sum9 = add i32 %sum8, %uge_zext
  %sum10 = add i32 %sum9, %ult_zext
  %sum11 = add i32 %sum10, %ule_zext
  %sum12 = add i32 %sum11, %une_zext
  %result = add i32 %sum12, %uno_zext
  store i32 %result, ptr %p
  ret void
}
; f32 variant of fcmp_f16_divergent: all 14 fcmp predicates on divergent
; (VGPR) float operands. Both targets use v_cmp_*_f32 into vcc_lo plus
; v_cndmask_b32 to materialize 0/1, accumulating with v_add3_u32.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — regenerate
; rather than hand-editing.
define void @fcmp_f32_divergent(float %a, float %b, ptr %p) {
; GFX10-LABEL: fcmp_f32_divergent:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v5
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
; GFX10-NEXT: v_add3_u32 v1, v4, v6, v7
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_add3_u32 v0, v1, v5, v0
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: fcmp_f32_divergent:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v5
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v4, v4, v5, v8
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_add3_u32 v1, v4, v6, v7
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v0, v1, v5, v0
; GFX12-NEXT: flat_store_b32 v[2:3], v0
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
  ; One compare per fcmp predicate on divergent float operands.
  %oeq_result = fcmp oeq float %a, %b
  %ogt_result = fcmp ogt float %a, %b
  %oge_result = fcmp oge float %a, %b
  %olt_result = fcmp olt float %a, %b
  %ole_result = fcmp ole float %a, %b
  %one_result = fcmp one float %a, %b
  %ord_result = fcmp ord float %a, %b
  %ueq_result = fcmp ueq float %a, %b
  %ugt_result = fcmp ugt float %a, %b
  %uge_result = fcmp uge float %a, %b
  %ult_result = fcmp ult float %a, %b
  %ule_result = fcmp ule float %a, %b
  %une_result = fcmp une float %a, %b
  %uno_result = fcmp uno float %a, %b
  ; Widen each i1 to i32 so the results can be accumulated.
  %oeq_zext = zext i1 %oeq_result to i32
  %ogt_zext = zext i1 %ogt_result to i32
  %oge_zext = zext i1 %oge_result to i32
  %olt_zext = zext i1 %olt_result to i32
  %ole_zext = zext i1 %ole_result to i32
  %one_zext = zext i1 %one_result to i32
  %ord_zext = zext i1 %ord_result to i32
  %ueq_zext = zext i1 %ueq_result to i32
  %ugt_zext = zext i1 %ugt_result to i32
  %uge_zext = zext i1 %uge_result to i32
  %ult_zext = zext i1 %ult_result to i32
  %ule_zext = zext i1 %ule_result to i32
  %une_zext = zext i1 %une_result to i32
  %uno_zext = zext i1 %uno_result to i32
  ; Sum all 14 results and store, keeping every compare live.
  %sum1 = add i32 %oeq_zext, %ogt_zext
  %sum2 = add i32 %sum1, %oge_zext
  %sum3 = add i32 %sum2, %olt_zext
  %sum4 = add i32 %sum3, %ole_zext
  %sum5 = add i32 %sum4, %one_zext
  %sum6 = add i32 %sum5, %ord_zext
  %sum7 = add i32 %sum6, %ueq_zext
  %sum8 = add i32 %sum7, %ugt_zext
  %sum9 = add i32 %sum8, %uge_zext
  %sum10 = add i32 %sum9, %ult_zext
  %sum11 = add i32 %sum10, %ule_zext
  %sum12 = add i32 %sum11, %une_zext
  %result = add i32 %sum12, %uno_zext
  store i32 %result, ptr %p
  ret void
}
define void @fcmp_f64_uniform(double inreg %a, double inreg %b, ptr %p) {
; GFX10-LABEL: fcmp_f64_uniform:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_f64_e64 s4, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_gt_f64_e64 s5, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_ge_f64_e64 s6, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_lt_f64_e64 s7, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_le_f64_e64 s8, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_lg_f64_e64 s9, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_o_f64_e64 s10, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_nlg_f64_e64 s11, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_nle_f64_e64 s12, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_nlt_f64_e64 s13, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_nge_f64_e64 s14, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_ngt_f64_e64 s15, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_neq_f64_e64 s20, s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_u_f64_e64 s16, s[16:17], s[18:19]
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: s_cselect_b32 s14, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s20, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: s_cselect_b32 s14, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_add_i32 s4, s4, s5
; GFX10-NEXT: s_add_i32 s4, s4, s6
; GFX10-NEXT: s_add_i32 s4, s4, s7
; GFX10-NEXT: s_add_i32 s4, s4, s8
; GFX10-NEXT: s_add_i32 s4, s4, s9
; GFX10-NEXT: s_add_i32 s4, s4, s10
; GFX10-NEXT: s_add_i32 s4, s4, s11
; GFX10-NEXT: s_add_i32 s4, s4, s12
; GFX10-NEXT: s_add_i32 s4, s4, s13
; GFX10-NEXT: s_add_i32 s4, s4, s14
; GFX10-NEXT: s_add_i32 s4, s4, s15
; GFX10-NEXT: s_add_i32 s4, s4, s17
; GFX10-NEXT: s_add_i32 s4, s4, s16
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: fcmp_f64_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_eq_f64_e64 s4, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_gt_f64_e64 s5, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_ge_f64_e64 s6, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_lt_f64_e64 s7, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_le_f64_e64 s8, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_lg_f64_e64 s9, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_o_f64_e64 s10, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_nlg_f64_e64 s11, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_nle_f64_e64 s12, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_nlt_f64_e64 s13, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_nge_f64_e64 s14, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_ngt_f64_e64 s15, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_neq_f64_e64 s16, s[0:1], s[2:3]
; GFX12-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
; GFX12-NEXT: s_cmp_lg_u32 s4, 0
; GFX12-NEXT: s_cselect_b32 s4, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s5, 0
; GFX12-NEXT: s_cselect_b32 s1, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s7, 0
; GFX12-NEXT: s_cselect_b32 s3, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s8, 0
; GFX12-NEXT: s_cselect_b32 s5, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s9, 0
; GFX12-NEXT: s_cselect_b32 s6, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s10, 0
; GFX12-NEXT: s_cselect_b32 s7, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s11, 0
; GFX12-NEXT: s_cselect_b32 s8, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s12, 0
; GFX12-NEXT: s_cselect_b32 s9, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s13, 0
; GFX12-NEXT: s_cselect_b32 s10, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s14, 0
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s15, 0
; GFX12-NEXT: s_cselect_b32 s12, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s16, 0
; GFX12-NEXT: s_cselect_b32 s13, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_cmp_lg_u32 s4, 0
; GFX12-NEXT: s_cselect_b32 s4, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-NEXT: s_cselect_b32 s1, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s3, 0
; GFX12-NEXT: s_cselect_b32 s3, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s5, 0
; GFX12-NEXT: s_cselect_b32 s5, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-NEXT: s_cselect_b32 s6, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s7, 0
; GFX12-NEXT: s_cselect_b32 s7, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s8, 0
; GFX12-NEXT: s_cselect_b32 s8, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s9, 0
; GFX12-NEXT: s_cselect_b32 s9, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s10, 0
; GFX12-NEXT: s_cselect_b32 s10, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s11, 0
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s12, 0
; GFX12-NEXT: s_cselect_b32 s12, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s13, 0
; GFX12-NEXT: s_cselect_b32 s13, 1, 0
; GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s4, s1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s2
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s3
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s5
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s6
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s7
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s8
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s9
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s10
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s11
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s12
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s1, s1, s13
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s0, s1, s0
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%oeq_result = fcmp oeq double %a, %b
%ogt_result = fcmp ogt double %a, %b
%oge_result = fcmp oge double %a, %b
%olt_result = fcmp olt double %a, %b
%ole_result = fcmp ole double %a, %b
%one_result = fcmp one double %a, %b
%ord_result = fcmp ord double %a, %b
%ueq_result = fcmp ueq double %a, %b
%ugt_result = fcmp ugt double %a, %b
%uge_result = fcmp uge double %a, %b
%ult_result = fcmp ult double %a, %b
%ule_result = fcmp ule double %a, %b
%une_result = fcmp une double %a, %b
%uno_result = fcmp uno double %a, %b
%oeq_zext = zext i1 %oeq_result to i32
%ogt_zext = zext i1 %ogt_result to i32
%oge_zext = zext i1 %oge_result to i32
%olt_zext = zext i1 %olt_result to i32
%ole_zext = zext i1 %ole_result to i32
%one_zext = zext i1 %one_result to i32
%ord_zext = zext i1 %ord_result to i32
%ueq_zext = zext i1 %ueq_result to i32
%ugt_zext = zext i1 %ugt_result to i32
%uge_zext = zext i1 %uge_result to i32
%ult_zext = zext i1 %ult_result to i32
%ule_zext = zext i1 %ule_result to i32
%une_zext = zext i1 %une_result to i32
%uno_zext = zext i1 %uno_result to i32
%sum1 = add i32 %oeq_zext, %ogt_zext
%sum2 = add i32 %sum1, %oge_zext
%sum3 = add i32 %sum2, %olt_zext
%sum4 = add i32 %sum3, %ole_zext
%sum5 = add i32 %sum4, %one_zext
%sum6 = add i32 %sum5, %ord_zext
%sum7 = add i32 %sum6, %ueq_zext
%sum8 = add i32 %sum7, %ugt_zext
%sum9 = add i32 %sum8, %uge_zext
%sum10 = add i32 %sum9, %ult_zext
%sum11 = add i32 %sum10, %ule_zext
%sum12 = add i32 %sum11, %une_zext
%result = add i32 %sum12, %uno_zext
store i32 %result, ptr %p
ret void
}
define void @fcmp_f64_divergent(double %a, double %b, ptr %p) {
; Purpose: exercise instruction selection for every one of the 14 fcmp
; predicates on *divergent* (VGPR) f64 operands. Each i1 result is zext'd
; to i32 and all 14 are summed into one value so no compare is dead-code
; eliminated; the sum is stored through %p. Divergent operands force the
; VALU path (v_cmp_*_f64 into vcc_lo + v_cndmask_b32), in contrast to the
; uniform variant above, which selects v_cmp_*_e64 into SGPRs followed by
; scalar s_cmp/s_cselect.
; NOTE: the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py — do not edit them by hand.
; GFX10-LABEL: fcmp_f64_divergent:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v7
; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_add3_u32 v6, v6, v8, v9
; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_lg_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_add3_u32 v6, v6, v7, v10
; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_add3_u32 v6, v6, v8, v9
; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_nlt_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_add3_u32 v6, v6, v7, v10
; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_ngt_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_neq_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT:    v_add3_u32 v1, v6, v8, v9
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT:    v_add3_u32 v0, v1, v7, v0
; GFX10-NEXT:    flat_store_dword v[4:5], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: fcmp_f64_divergent:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_ge_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_add_nc_u32_e32 v6, v6, v7
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_add3_u32 v6, v6, v8, v9
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_lg_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_add3_u32 v6, v6, v7, v10
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_add3_u32 v6, v6, v8, v9
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_nlt_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_add3_u32 v6, v6, v7, v10
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_ngt_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_neq_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
; GFX12-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
; GFX12-NEXT:    v_add3_u32 v1, v6, v8, v9
; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_add3_u32 v0, v1, v7, v0
; GFX12-NEXT:    flat_store_b32 v[4:5], v0
; GFX12-NEXT:    s_wait_dscnt 0x0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
; One fcmp per predicate: 7 ordered (oeq..ord) and 7 unordered (ueq..uno).
  %oeq_result = fcmp oeq double %a, %b
  %ogt_result = fcmp ogt double %a, %b
  %oge_result = fcmp oge double %a, %b
  %olt_result = fcmp olt double %a, %b
  %ole_result = fcmp ole double %a, %b
  %one_result = fcmp one double %a, %b
  %ord_result = fcmp ord double %a, %b
  %ueq_result = fcmp ueq double %a, %b
  %ugt_result = fcmp ugt double %a, %b
  %uge_result = fcmp uge double %a, %b
  %ult_result = fcmp ult double %a, %b
  %ule_result = fcmp ule double %a, %b
  %une_result = fcmp une double %a, %b
  %uno_result = fcmp uno double %a, %b
; Widen each i1 so the results can be combined arithmetically.
  %oeq_zext = zext i1 %oeq_result to i32
  %ogt_zext = zext i1 %ogt_result to i32
  %oge_zext = zext i1 %oge_result to i32
  %olt_zext = zext i1 %olt_result to i32
  %ole_zext = zext i1 %ole_result to i32
  %one_zext = zext i1 %one_result to i32
  %ord_zext = zext i1 %ord_result to i32
  %ueq_zext = zext i1 %ueq_result to i32
  %ugt_zext = zext i1 %ugt_result to i32
  %uge_zext = zext i1 %uge_result to i32
  %ult_zext = zext i1 %ult_result to i32
  %ule_zext = zext i1 %ule_result to i32
  %une_zext = zext i1 %une_result to i32
  %uno_zext = zext i1 %uno_result to i32
; Chain-sum all 14 results into a single i32 so every compare stays live,
; then store through the divergent pointer.
  %sum1 = add i32 %oeq_zext, %ogt_zext
  %sum2 = add i32 %sum1, %oge_zext
  %sum3 = add i32 %sum2, %olt_zext
  %sum4 = add i32 %sum3, %ole_zext
  %sum5 = add i32 %sum4, %one_zext
  %sum6 = add i32 %sum5, %ord_zext
  %sum7 = add i32 %sum6, %ueq_zext
  %sum8 = add i32 %sum7, %ugt_zext
  %sum9 = add i32 %sum8, %uge_zext
  %sum10 = add i32 %sum9, %ult_zext
  %sum11 = add i32 %sum10, %ule_zext
  %sum12 = add i32 %sum11, %une_zext
  %result = add i32 %sum12, %uno_zext
  store i32 %result, ptr %p
  ret void
}