| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 |
| ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s |
| |
| declare i32 @llvm.amdgcn.workitem.id.x() #1 |
| declare half @llvm.fabs.f16(half) |
| declare float @llvm.fabs.f32(float) |
| declare double @llvm.fabs.f64(double) |
| |
| ; Select between a NaN constant and a float loaded per workitem. |
| ; When %c != 0 the NaN is stored to %out; otherwise the value loaded from |
| ; %fptr[workitem.id.x] is stored. In the expected code the data operands of |
| ; v_cndmask_b32 are the inline constant -1 and a VGPR holding the loaded value. |
| ; All NaN values are converted to 0xffffffff. |
| define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { |
| ; SI-LABEL: v_cnd_nan_nosgpr: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dword s8, s[4:5], 0xb |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s3, 0xf000 |
| ; SI-NEXT: s_mov_b32 s6, 0 |
| ; SI-NEXT: s_mov_b32 s7, s3 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 |
| ; SI-NEXT: s_mov_b32 s2, -1 |
| ; SI-NEXT: s_cmp_eq_u32 s8, 0 |
| ; SI-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc |
| ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: v_cnd_nan_nosgpr: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: flat_load_dword v0, v[0:1] |
| ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: s_cmp_eq_u32 s2, 0 |
| ; VI-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc |
| ; VI-NEXT: v_mov_b32_e32 v0, s0 |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: v_cnd_nan_nosgpr: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v0, v0, s[0:1] |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_cmp_eq_u32 s2, 0 |
| ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc |
| ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: v_cnd_nan_nosgpr: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_cmp_eq_u32 s2, 0 |
| ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc |
| ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: v_cnd_nan_nosgpr: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_eq_u32 s2, 0 |
| ; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc |
| ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Load the per-workitem element of %fptr; this is the non-constant select operand. |
| %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx |
| %f = load float, ptr addrspace(1) %f.gep |
| ; float 0xFFFFFFFFE0000000 is a NaN written in LLVM's hexadecimal (double-format) |
| ; float syntax; the checks expect it to be emitted as the inline constant -1. |
| %setcc = icmp ne i32 %c, 0 |
| %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f |
| store float %select, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Same NaN-or-value select as v_cnd_nan_nosgpr, but %f is a kernel argument, |
| ; so the non-constant select operand arrives in an SGPR (s3 in the checks). |
| ; This requires slightly trickier SGPR operand legalization, since the |
| ; single constant-bus SGPR use is the last operand and it should |
| ; never be moved. |
| ; However, on GFX10 the constant bus is limited to two scalar operands, not one. |
| ; Expected code: SI/VI copy s3 into a VGPR and use the e32 v_cndmask_b32 with vcc; |
| ; GFX10/GFX11 use s3 directly in v_cndmask_b32_e64; GFX12 uses s_cselect_b32. |
| ; All NaN values are converted to 0xffffffff. |
| define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 { |
| ; SI-LABEL: v_cnd_nan: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_mov_b32 s7, 0xf000 |
| ; SI-NEXT: s_mov_b32 s6, -1 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b32 s4, s0 |
| ; SI-NEXT: s_mov_b32 s5, s1 |
| ; SI-NEXT: s_cmp_eq_u32 s2, 0 |
| ; SI-NEXT: v_mov_b32_e32 v0, s3 |
| ; SI-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc |
| ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: v_cnd_nan: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: s_cmp_eq_u32 s2, 0 |
| ; VI-NEXT: v_mov_b32_e32 v0, s3 |
| ; VI-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc |
| ; VI-NEXT: v_mov_b32_e32 v0, s0 |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: v_cnd_nan: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_cmp_eq_u32 s2, 0 |
| ; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] |
| ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: v_cnd_nan: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_cmp_eq_u32 s2, 0 |
| ; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: v_cnd_nan: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_eq_u32 s2, 0 |
| ; GFX12-NEXT: s_cselect_b32 s2, s3, -1 |
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; %c != 0 selects the NaN constant (expected as inline constant -1); otherwise %f. |
| %setcc = icmp ne i32 %c, 0 |
| %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f |
| store float %select, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Test different compare and select operand types for optimal code |
| ; shrinking. |
| ; (select (cmp (sgprX, constant)), constant, sgprZ) |
| ; Here %x and %z are both kernel arguments: %x is compared against 0.0 and the |
| ; select yields 1.0 or %z, stored to %out[workitem.id.x]. The unnamed [8 x i32] |
| ; argument pads the kernarg layout, so the checks load %x/%z separately from |
| ; %out (e.g. offsets 0x4c vs. 0x24 on VI and later). |
| ; Expected code: the inverted compare (nlg) is used so the constant sits in the |
| ; first source slot of v_cndmask_b32; GFX12 does the whole select on the SALU. |
| define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { |
| ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 |
| ; SI-NEXT: s_mov_b32 s3, 0xf000 |
| ; SI-NEXT: s_mov_b32 s2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: v_mov_b32_e32 v2, s5 |
| ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c |
| ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v2, s3 |
| ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 |
| ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c |
| ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] |
| ; GFX10-NEXT: global_store_dword v0, v1, s[2:3] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c |
| ; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 |
| ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c |
| ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_nlg_f32 s0, 0 |
| ; GFX12-NEXT: s_cselect_b32 s0, s1, 1.0 |
| ; GFX12-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] |
| ; GFX12-NEXT: s_endpgm |
| ; Each workitem writes its own element of %out. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext |
| ; 'one' is ordered-and-not-equal, so a NaN or zero %x selects %z. |
| %setcc = fcmp one float %x, 0.0 |
| %select = select i1 %setcc, float 1.0, float %z |
| store float %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; (select (cmp (sgprX, 0.0)), 1.0, sgprX) |
| ; The same kernel argument %x is both the compare operand and the false value |
| ; of the select; the result is stored to %out[workitem.id.x]. |
| ; Expected code: SI/VI copy %x into a VGPR for the e32 v_cndmask_b32; GFX10/GFX11 |
| ; read the same SGPR (s6) in both v_cmp and v_cndmask_b32_e64; GFX12 uses |
| ; s_cmp_nlg_f32 + s_cselect_b32. |
| define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { |
| ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dword s4, s[4:5], 0xb |
| ; SI-NEXT: s_mov_b32 s3, 0xf000 |
| ; SI-NEXT: s_mov_b32 s2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: v_mov_b32_e32 v2, s4 |
| ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v2, s2 |
| ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 |
| ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3] |
| ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 |
| ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3] |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 |
| ; GFX12-NEXT: s_cselect_b32 s2, s2, 1.0 |
| ; GFX12-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Each workitem writes its own element of %out. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext |
| ; 'one' is ordered-and-not-equal, so a NaN or zero %x is passed through unchanged. |
| %setcc = fcmp one float %x, 0.0 |
| %select = select i1 %setcc, float 1.0, float %x |
| store float %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; (select (cmp (sgprX, 0.0)), 0.0, sgprZ) |
| ; Like the k1 variant with two SGPR inputs, but the select constant is the same |
| ; 0.0 used by the compare. %x and %z are kernel arguments separated from %out |
| ; by an unnamed [8 x i32] padding argument; the result goes to |
| ; %out[workitem.id.x]. |
| ; Expected code: the constant appears as inline 0 in v_cndmask_b32 (or in |
| ; s_cselect_b32 on GFX12). |
| define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { |
| ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 |
| ; SI-NEXT: s_mov_b32 s3, 0xf000 |
| ; SI-NEXT: s_mov_b32 s2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: v_mov_b32_e32 v2, s5 |
| ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c |
| ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v2, s3 |
| ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 |
| ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c |
| ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] |
| ; GFX10-NEXT: global_store_dword v0, v1, s[2:3] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c |
| ; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 |
| ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c |
| ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_nlg_f32 s0, 0 |
| ; GFX12-NEXT: s_cselect_b32 s0, s1, 0 |
| ; GFX12-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] |
| ; GFX12-NEXT: s_endpgm |
| ; Each workitem writes its own element of %out. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext |
| ; 'one' is ordered-and-not-equal, so a NaN or zero %x selects %z. |
| %setcc = fcmp one float %x, 0.0 |
| %select = select i1 %setcc, float 0.0, float %z |
| store float %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; (select (cmp (sgprX, 0.0)), 0.0, sgprX) |
| ; The kernel argument %x is both the compare operand and the false value of the |
| ; select, and the select constant equals the compare constant (0.0). The result |
| ; is stored to %out[workitem.id.x]. |
| ; Expected code: SI/VI copy %x into a VGPR for the e32 v_cndmask_b32; GFX10/GFX11 |
| ; read the same SGPR (s6) in both v_cmp and v_cndmask_b32_e64; GFX12 uses |
| ; s_cmp_nlg_f32 + s_cselect_b32. |
| define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { |
| ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dword s4, s[4:5], 0xb |
| ; SI-NEXT: s_mov_b32 s3, 0xf000 |
| ; SI-NEXT: s_mov_b32 s2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: v_mov_b32_e32 v2, s4 |
| ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v2, s2 |
| ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 |
| ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3] |
| ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 |
| ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3] |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 |
| ; GFX12-NEXT: s_cselect_b32 s2, s2, 0 |
| ; GFX12-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Each workitem writes its own element of %out. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext |
| ; 'one' is ordered-and-not-equal, so a NaN or zero %x is passed through unchanged. |
| %setcc = fcmp one float %x, 0.0 |
| %select = select i1 %setcc, float 0.0, float %x |
| store float %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; (select (cmp (sgprX, 0.0)), 0.0, vgprZ) |
| ; %x is a kernel argument; %z is loaded from %z.ptr[workitem.id.x], so it lives |
| ; in a VGPR. The select yields 0.0 or %z and is stored to %out[workitem.id.x]. |
| ; Expected code: every target uses the e32 v_cndmask_b32 with vcc and inline |
| ; constant 0; GFX12 builds vcc with s_cmp_nlg_f32 + s_cselect_b64, the others |
| ; with v_cmp_nlg_f32_e64. |
| define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dword s6, s[4:5], 0xb |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s3, 0xf000 |
| ; SI-NEXT: s_mov_b32 s2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 |
| ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s6, 0 |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: flat_load_dword v3, v[0:1] |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v1, v0, s[0:1] |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc |
| ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 |
| ; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Both %z.ptr and %out are indexed by the workitem id. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext |
| %z = load float, ptr addrspace(1) %z.gep |
| ; 'one' is ordered-and-not-equal, so a NaN or zero %x selects the loaded %z. |
| %setcc = fcmp one float %x, 0.0 |
| %select = select i1 %setcc, float 0.0, float %z |
| store float %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; (select (cmp (sgprX, 0.0)), 1.0, vgprZ) |
| ; %x is a kernel argument; %z is loaded from %z.ptr[workitem.id.x], so it lives |
| ; in a VGPR. The select yields 1.0 or %z and is stored to %out[workitem.id.x]. |
| ; Expected code: every target uses the e32 v_cndmask_b32 with vcc and inline |
| ; constant 1.0; GFX12 builds vcc with s_cmp_nlg_f32 + s_cselect_b64, the others |
| ; with v_cmp_nlg_f32_e64. |
| define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dword s6, s[4:5], 0xb |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s3, 0xf000 |
| ; SI-NEXT: s_mov_b32 s2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 |
| ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s6, 0 |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: flat_load_dword v3, v[0:1] |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v1, v0, s[0:1] |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc |
| ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 |
| ; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Both %z.ptr and %out are indexed by the workitem id. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext |
| %z = load float, ptr addrspace(1) %z.gep |
| ; 'one' is ordered-and-not-equal, so a NaN or zero %x selects the loaded %z. |
| %setcc = fcmp one float %x, 0.0 |
| %select = select i1 %setcc, float 1.0, float %z |
| store float %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; (select (cmp (vgprX, 0.0)), 1.0, sgprZ) |
| ; %x is loaded from %x.ptr[workitem.id.x], so it lives in a VGPR; %z is a kernel |
| ; argument. The select yields 1.0 when %x < 0.0 (ordered) and %z otherwise, and |
| ; is stored to %out[workitem.id.x]. |
| ; Expected code: the compare is emitted inverted with the constant first |
| ; (v_cmp_ngt_f32_e32 vcc, 0, x). SI/VI copy %z into a VGPR for the e32 |
| ; v_cndmask_b32; GFX10 and later read the SGPR directly in v_cndmask_b32_e64. |
| define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 { |
| ; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dword s8, s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s7, 0xf000 |
| ; SI-NEXT: s_mov_b32 s6, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] |
| ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 |
| ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] |
| ; SI-NEXT: v_mov_b32_e32 v3, s8 |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v2 |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dword s4, s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s3 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: flat_load_dword v3, v[0:1] |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v4, s4 |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v3 |
| ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc |
| ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 |
| ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 |
| ; GFX12-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Both %x.ptr and %out are indexed by the workitem id. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext |
| %x = load float, ptr addrspace(1) %x.gep |
| ; 'olt' is ordered-less-than, so a NaN %x selects %z. |
| %setcc = fcmp olt float %x, 0.0 |
| %select = select i1 %setcc, float 1.0, float %z |
| store float %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; Selects between the constant 1.0 (true operand) and a loaded float %z |
| ; (false operand) on 'fcmp ult %x, 0.0'. |
| ; The expected output on every target uses the inverse compare |
| ; (v_cmp_le_f32 with 0 as src0) so that the constant 1.0 is src0 of |
| ; v_cndmask_b32_e32 and the loaded value is src1. |
| define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s11, 0xf000 |
| ; SI-NEXT: s_mov_b32 s10, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_mov_b64 s[6:7], s[10:11] |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] |
| ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2 |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s3 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v3, s5 |
| ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 |
| ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
| ; VI-NEXT: flat_load_dword v5, v[0:1] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_dword v2, v[2:3] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v5 |
| ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc |
| ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Each work-item indexes x, z and out by its sign-extended workitem id. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext |
| ; Both loads are volatile; the expected output issues them in this order, |
| ; each followed by its own wait, before the compare. |
| %x = load volatile float, ptr addrspace(1) %x.gep |
| %z = load volatile float, ptr addrspace(1) %z.gep |
| ; Unordered less-than against 0.0; the constant is the select's true operand. |
| %setcc = fcmp ult float %x, 0.0 |
| %select = select i1 %setcc, float 1.0, float %z |
| store float %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; Integer counterpart of the scalar float select test: selects between the |
| ; constant 2 (true operand) and a loaded i32 %z on 'icmp slt %x, 0'. |
| ; The expected output uses the inverse compare (v_cmp_lt_i32 with -1 as |
| ; src0, i.e. x > -1) so that the constant 2 is src0 of v_cndmask_b32_e32. |
| define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s11, 0xf000 |
| ; SI-NEXT: s_mov_b32 s10, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_mov_b64 s[6:7], s[10:11] |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] |
| ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 2, v3, vcc |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s3 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v3, s5 |
| ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 |
| ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
| ; VI-NEXT: flat_load_dword v5, v[0:1] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_dword v2, v[2:3] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5 |
| ; VI-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc |
| ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Each work-item indexes x, z and out by its sign-extended workitem id. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds i32, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext |
| ; Both loads are volatile; the expected output issues them in this order, |
| ; each followed by its own wait, before the compare. |
| %x = load volatile i32, ptr addrspace(1) %x.gep |
| %z = load volatile i32, ptr addrspace(1) %z.gep |
| ; Signed less-than against 0; the constant is the select's true operand. |
| %setcc = icmp slt i32 %x, 0 |
| %select = select i1 %setcc, i32 2, i32 %z |
| store i32 %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; 64-bit variant: selects between the constant i64 2 (true operand) and a |
| ; loaded i64 %z on 'icmp slt %x, 0'. |
| ; The expected output uses one inverse 64-bit compare (v_cmp_lt_i64 with -1 |
| ; as src0) feeding two v_cndmask_b32_e32 that share vcc: constant 0 for the |
| ; high dword and constant 2 for the low dword, each as src0. |
| define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s11, 0xf000 |
| ; SI-NEXT: s_mov_b32 s10, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_mov_b64 s[6:7], s[10:11] |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] |
| ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3] |
| ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 2, v4, vcc |
| ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s3 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v3, s5 |
| ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 |
| ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
| ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v5, s1 |
| ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 |
| ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc |
| ; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] |
| ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc |
| ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc |
| ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc |
| ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b64 v[0:1], v4, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b64 v[2:3], v4, s[4:5] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc |
| ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Each work-item indexes x, z and out (8-byte elements) by its |
| ; sign-extended workitem id. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext |
| ; Both loads are volatile; the expected output issues them in this order, |
| ; each followed by its own wait, before the compare. |
| %x = load volatile i64, ptr addrspace(1) %x.gep |
| %z = load volatile i64, ptr addrspace(1) %z.gep |
| ; Signed less-than against 0; the constant is the select's true operand. |
| %setcc = icmp slt i64 %x, 0 |
| %select = select i1 %setcc, i64 2, i64 %z |
| store i64 %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; Vector select driven by a scalar float compare: 'fcmp ugt %x, 4.0' picks |
| ; the loaded <4 x float> %z (true operand) or a constant vector (false |
| ; operand). |
| ; The expected output keeps the same predicate with commuted operands |
| ; (v_cmp_nge_f32 with 4.0 as src0) and emits one v_cndmask_b32_e32 per |
| ; element sharing vcc, with each element's constant as src0. |
| define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s11, 0xf000 |
| ; SI-NEXT: s_mov_b32 s10, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 |
| ; SI-NEXT: s_mov_b64 s[6:7], s[10:11] |
| ; SI-NEXT: v_mov_b32_e32 v5, v2 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] |
| ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 |
| ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v2, s3 |
| ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 |
| ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; VI-NEXT: v_mov_b32_e32 v0, s5 |
| ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 |
| ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc |
| ; VI-NEXT: flat_load_dword v6, v[1:2] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v7, s1 |
| ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5 |
| ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc |
| ; VI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 |
| ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5 |
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; x is indexed as a scalar float, while z and out are indexed as |
| ; <4 x float>, all by the sign-extended workitem id. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext |
| ; Both loads are volatile; the expected output issues them in this order, |
| ; each followed by its own wait, before the compare. |
| %x = load volatile float, ptr addrspace(1) %x.gep |
| %z = load volatile <4 x float>, ptr addrspace(1) %z.gep |
| ; One scalar condition selects the whole vector; the constant vector is |
| ; the false operand here. |
| %setcc = fcmp ugt float %x, 4.0 |
| %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0> |
| store <4 x float> %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; Same vector select as the vgprZ_k1 variant but with the select operands |
| ; swapped: 'fcmp ugt %x, 4.0' picks the constant vector (true operand) or |
| ; the loaded <4 x float> %z (false operand). |
| ; The expected output uses the inverse compare (v_cmp_ge_f32 with 4.0 as |
| ; src0) so that each element's constant is still src0 of its |
| ; v_cndmask_b32_e32, all sharing one vcc. |
| define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s11, 0xf000 |
| ; SI-NEXT: s_mov_b32 s10, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 |
| ; SI-NEXT: s_mov_b64 s[6:7], s[10:11] |
| ; SI-NEXT: v_mov_b32_e32 v5, v2 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] |
| ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 |
| ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v2, s3 |
| ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 |
| ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; VI-NEXT: v_mov_b32_e32 v0, s5 |
| ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 |
| ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc |
| ; VI-NEXT: flat_load_dword v6, v[1:2] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v7, s1 |
| ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5 |
| ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc |
| ; VI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 |
| ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5 |
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; x is indexed as a scalar float, while z and out are indexed as |
| ; <4 x float>, all by the sign-extended workitem id. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext |
| ; Both loads are volatile; the expected output issues them in this order, |
| ; each followed by its own wait, before the compare. |
| %x = load volatile float, ptr addrspace(1) %x.gep |
| %z = load volatile <4 x float>, ptr addrspace(1) %z.gep |
| ; One scalar condition selects the whole vector; the constant vector is |
| ; the true operand here. |
| %setcc = fcmp ugt float %x, 4.0 |
| %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z |
| store <4 x float> %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; This must be swapped as a vector type before the condition has |
| ; multiple uses. |
| define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s11, 0xf000 |
| ; SI-NEXT: s_mov_b32 s10, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 |
| ; SI-NEXT: s_mov_b64 s[6:7], s[10:11] |
| ; SI-NEXT: v_mov_b32_e32 v5, v2 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] |
| ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; SI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 |
| ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v2, s3 |
| ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 |
| ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; VI-NEXT: v_mov_b32_e32 v0, s5 |
| ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 |
| ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc |
| ; VI-NEXT: flat_load_dword v6, v[1:2] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v7, s1 |
| ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5 |
| ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc |
| ; VI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 |
| ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5 |
| ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc |
| ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext |
| %x = load volatile float, ptr addrspace(1) %x.gep |
| %z = load volatile <4 x float>, ptr addrspace(1) %z.gep |
| %setcc = fcmp ugt float 4.0, %x |
| %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z |
| store <4 x float> %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; Select of an i1 value: the result is constant true when the loaded i32 %x is |
| ; negative, otherwise the loaded i1 %z. For every run line the checked output |
| ; ORs the compare mask with bit 0 of the loaded byte (s_or_b64) and then |
| ; materializes 0/1 with v_cndmask_b32_e64 before the byte store. |
| define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s6, 0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_mov_b32 s7, 0xf000 |
| ; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v3, v1 |
| ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[4:5], s[10:11] |
| ; SI-NEXT: buffer_load_dword v2, v[2:3], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[10:11], s[6:7] |
| ; SI-NEXT: v_and_b32_e32 v3, 1, v3 |
| ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 |
| ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 |
| ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] |
| ; SI-NEXT: buffer_store_byte v2, v[0:1], s[8:11], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v2, s3 |
| ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 |
| ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; VI-NEXT: v_mov_b32_e32 v4, s5 |
| ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0 |
| ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc |
| ; VI-NEXT: flat_load_dword v2, v[1:2] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_ubyte v3, v[3:4] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 |
| ; VI-NEXT: v_and_b32_e32 v3, 1, v3 |
| ; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 |
| ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] |
| ; VI-NEXT: flat_store_byte v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v2, v1, s[10:11] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 |
| ; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 |
| ; GFX10-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 |
| ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] |
| ; GFX10-NEXT: global_store_byte v0, v1, s[8:9] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v1, v1, s[10:11] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 |
| ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 |
| ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] |
| ; GFX11-NEXT: global_store_b8 v0, v1, s[8:9] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v1, v1, s[10:11] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_u8 v2, v0, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 |
| ; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) |
| ; GFX12-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 |
| ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] |
| ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] |
| ; GFX12-NEXT: global_store_b8 v0, v1, s[8:9] |
| ; GFX12-NEXT: s_endpgm |
| ; Per-workitem element addresses: i32 elements for %x, i1 elements for %z and %out. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds i1, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds i1, ptr addrspace(1) %out, i64 %tid.ext |
| ; Both inputs are volatile loads; the checks expect each load to be waited on in turn. |
| %x = load volatile i32, ptr addrspace(1) %x.gep |
| %z = load volatile i1, ptr addrspace(1) %z.gep |
| ; result = (x < 0) ? true : z |
| %setcc = icmp slt i32 %x, 0 |
| %select = select i1 %setcc, i1 true, i1 %z |
| store i1 %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; Different types compared vs. selected |
| ; The condition is an f32 compare (%x ult 0.0) while the selected value is a |
| ; double (constant 1.0 or the loaded %z). The checks expect the inverse compare |
| ; (v_cmp_le_f32 with operands 0, x) feeding two v_cndmask_b32, one per 32-bit |
| ; half of the double. The high half picks between 0x3ff00000 (the upper dword of |
| ; double 1.0) and the loaded high half; the low half picks between 0 and the |
| ; loaded low half. In the SI and VI output 0x3ff00000 is first moved into a VGPR, |
| ; while the GFX10, GFX11 and GFX12 output uses it directly as a literal operand. |
| define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s11, 0xf000 |
| ; SI-NEXT: s_mov_b32 s10, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 |
| ; SI-NEXT: s_mov_b64 s[6:7], s[10:11] |
| ; SI-NEXT: v_mov_b32_e32 v4, v2 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] |
| ; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: v_mov_b32_e32 v5, 0x3ff00000 |
| ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2 |
| ; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc |
| ; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v2, s3 |
| ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 |
| ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; VI-NEXT: v_mov_b32_e32 v0, s5 |
| ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 |
| ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc |
| ; VI-NEXT: flat_load_dword v6, v[1:2] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v3, s1 |
| ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v5 |
| ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
| ; VI-NEXT: v_mov_b32_e32 v4, 0x3ff00000 |
| ; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v6 |
| ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc |
| ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v4, v2, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc |
| ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v3, v1, s[2:3] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3 |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc |
| ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v3 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc |
| ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Per-workitem element addresses: float elements for %x, double elements for %z and %out. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds double, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext |
| %x = load volatile float, ptr addrspace(1) %x.gep |
| %z = load volatile double, ptr addrspace(1) %z.gep |
| ; result = (x ult 0.0) ? 1.0 : z   (ult is also true when x is NaN) |
| %setcc = fcmp ult float %x, 0.0 |
| %select = select i1 %setcc, double 1.0, double %z |
| store double %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; Different types compared vs. selected |
| ; The condition is an f32 compare (%x one 0.0) while the selected value is an |
| ; i64 (constant 3 or the loaded %z). The checks expect the inverse compare |
| ; v_cmp_nlg_f32 and one v_cndmask_b32 per 32-bit half of the i64, using the |
| ; inline constants 0 (high half) and 3 (low half) as the first source operand. |
| define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s11, 0xf000 |
| ; SI-NEXT: s_mov_b32 s10, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 |
| ; SI-NEXT: s_mov_b64 s[6:7], s[10:11] |
| ; SI-NEXT: v_mov_b32_e32 v4, v2 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] |
| ; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2 |
| ; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc |
| ; SI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc |
| ; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v2, s3 |
| ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 |
| ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc |
| ; VI-NEXT: v_mov_b32_e32 v0, s5 |
| ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 |
| ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc |
| ; VI-NEXT: flat_load_dword v6, v[1:2] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v3, s1 |
| ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v5 |
| ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
| ; VI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v6 |
| ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc |
| ; VI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc |
| ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v4, v2, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc |
| ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v3, v1, s[2:3] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc |
| ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc |
| ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Per-workitem element addresses: float elements for %x, i64 elements for %z and %out. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext |
| %x = load volatile float, ptr addrspace(1) %x.gep |
| %z = load volatile i64, ptr addrspace(1) %z.gep |
| ; result = (x one 0.0) ? 3 : z   (one is false when x is NaN) |
| %setcc = fcmp one float %x, 0.0 |
| %select = select i1 %setcc, i64 3, i64 %z |
| store i64 %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; Different types compared vs. selected |
| ; The condition is an unsigned i32 compare (%x ugt 1) while the selected value |
| ; is a float (constant 4.0 or the loaded %z). The checks expect the inverse |
| ; compare written as v_cmp_gt_u32 with operands 2, x, which leaves the inline |
| ; constant 4.0 in the first source slot of a single v_cndmask_b32_e32. |
| define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s11, 0xf000 |
| ; SI-NEXT: s_mov_b32 s10, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_mov_b64 s[6:7], s[10:11] |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] |
| ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2 |
| ; SI-NEXT: v_cndmask_b32_e32 v2, 4.0, v3, vcc |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s3 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v3, s5 |
| ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 |
| ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
| ; VI-NEXT: flat_load_dword v5, v[0:1] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_dword v2, v[2:3] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v5 |
| ; VI-NEXT: v_cndmask_b32_e32 v2, 4.0, v2, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc |
| ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| ; Per-workitem element addresses: i32 elements for %x, float elements for %z and %out. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext |
| %x = load volatile i32, ptr addrspace(1) %x.gep |
| %z = load volatile float, ptr addrspace(1) %z.gep |
| ; result = (x >u 1) ? 4.0 : z |
| %setcc = icmp ugt i32 %x, 1 |
| %select = select i1 %setcc, float 4.0, float %z |
| store float %select, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; FIXME: Should be able to handle multiple uses |
| ; The compare result %setcc feeds two selects with different constants (-1.0 and |
| ; -2.0). Unlike the single-use tests in this file, the checked output keeps the |
| ; compare un-inverted (v_cmp_nle_f32 with operands 4.0, x) and uses |
| ; v_cndmask_b32_e64 with the constant in the second source slot. Both stores are |
| ; volatile, and the checked output places a wait instruction after each of them. |
| define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { |
| ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s11, 0xf000 |
| ; SI-NEXT: s_mov_b32 s10, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_mov_b64 s[6:7], s[10:11] |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] |
| ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v2 |
| ; SI-NEXT: v_cndmask_b32_e64 v2, v3, -1.0, vcc |
| ; SI-NEXT: v_cndmask_b32_e64 v3, v3, -2.0, vcc |
| ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s3 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v3, s5 |
| ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 |
| ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
| ; VI-NEXT: flat_load_dword v5, v[0:1] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_load_dword v2, v[2:3] glc |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v5 |
| ; VI-NEXT: v_cndmask_b32_e64 v3, v2, -1.0, vcc |
| ; VI-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc |
| ; VI-NEXT: flat_store_dword v[0:1], v3 |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc |
| ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc |
| ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_store_dword v0, v2, s[0:1] |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 |
| ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc |
| ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc |
| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_clause 0x1 |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 |
| ; GFX12-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc |
| ; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc |
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_storecnt 0x0 |
| ; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS |
| ; GFX12-NEXT: s_wait_storecnt 0x0 |
| ; GFX12-NEXT: s_endpgm |
| ; Per-workitem element addresses: float elements for %x, %z and %out. |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %tid.ext = sext i32 %tid to i64 |
| %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext |
| %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext |
| %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext |
| %x = load volatile float, ptr addrspace(1) %x.gep |
| %z = load volatile float, ptr addrspace(1) %z.gep |
| ; One compare (4.0 ugt x, also true when x is NaN) shared by both selects below. |
| %setcc = fcmp ugt float 4.0, %x |
| %select0 = select i1 %setcc, float -1.0, float %z |
| %select1 = select i1 %setcc, float -2.0, float %z |
| ; Both results are written to the same address; the stores are volatile. |
| store volatile float %select0, ptr addrspace(1) %out.gep |
| store volatile float %select1, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; Source modifiers abs/neg only work for f32 |
| define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { |
| ; SI-LABEL: v_cndmask_abs_neg_f16: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dword s8, s[4:5], 0xb |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s7, 0xf000 |
| ; SI-NEXT: s_mov_b32 s2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_mov_b32 s3, s7 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 |
| ; SI-NEXT: s_mov_b32 s6, -1 |
| ; SI-NEXT: s_cmp_lg_u32 s8, 0 |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| |
| ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 |
| ; SI-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc |
| ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: v_cndmask_abs_neg_f16: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: flat_load_ushort v0, v[0:1] |
| ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: s_cmp_lg_u32 s2, 0 |
| ; VI-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 |
| ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 |
| ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc |
| ; VI-NEXT: v_mov_b32_e32 v0, s0 |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: flat_store_short v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: v_cndmask_abs_neg_f16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_ushort v0, v0, s[0:1] |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0 |
| ; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc |
| ; GFX10-NEXT: global_store_short v2, v0, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: v_cndmask_abs_neg_f16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 |
| ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc |
| ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: v_cndmask_abs_neg_f16: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_u16 v0, v0, s[0:1] |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff, v0 |
| ; GFX12-NEXT: v_xor_b32_e32 v0, 0x8000, v0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc |
| ; GFX12-NEXT: global_store_b16 v2, v0, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx |
| %f = load half, ptr addrspace(1) %f.gep |
| %f.abs = call half @llvm.fabs.f16(half %f) |
| %f.neg = fneg half %f |
| %setcc = icmp ne i32 %c, 0 |
| %select = select i1 %setcc, half %f.abs, half %f.neg |
| store half %select, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Loads a float from %fptr at the work-item's x id and stores to %out either |
| ; fabs(%f) when %c != 0 or fneg(%f) otherwise. |
| ; The checks for every target expect the fabs and the fneg to be folded into a |
| ; single v_cndmask_b32_e64 as abs/neg source modifiers on the loaded value, |
| ; with the select condition coming from s_cmp_lg_u32 + s_cselect_b64. |
| define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { |
| ; SI-LABEL: v_cndmask_abs_neg_f32: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 |
| ; SI-NEXT: s_load_dword s8, s[4:5], 0xb |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s3, 0xf000 |
| ; SI-NEXT: s_mov_b32 s6, 0 |
| ; SI-NEXT: s_mov_b32 s7, s3 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 |
| ; SI-NEXT: s_mov_b32 s2, -1 |
| ; SI-NEXT: s_cmp_lg_u32 s8, 0 |
| ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5] |
| ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: v_cndmask_abs_neg_f32: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: flat_load_dword v0, v[0:1] |
| ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: s_cmp_lg_u32 s2, 0 |
| ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] |
| ; VI-NEXT: v_mov_b32_e32 v0, s0 |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: flat_store_dword v[0:1], v2 |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: v_cndmask_abs_neg_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dword v0, v0, s[0:1] |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX10-NEXT: s_cselect_b64 s[2:3], -1, 0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] |
| ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: v_cndmask_abs_neg_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] |
| ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: v_cndmask_abs_neg_f32: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX12-NEXT: s_cselect_b64 s[2:3], -1, 0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] |
| ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx |
| %f = load float, ptr addrspace(1) %f.gep |
| %f.abs = call float @llvm.fabs.f32(float %f) |
| %f.neg = fneg float %f |
| %setcc = icmp ne i32 %c, 0 |
| %select = select i1 %setcc, float %f.abs, float %f.neg |
| store float %select, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Loads a double from %fptr at the work-item's x id and stores to %out either |
| ; fabs(%f) when %c != 0 or fneg(%f) otherwise. |
| ; The checks for every target expect the abs and the neg to be computed with |
| ; v_and_b32 0x7fffffff and v_xor_b32 0x80000000 on the second register of the |
| ; loaded pair (v1) only, followed by one v_cndmask_b32 on that register; the |
| ; first register (v0) is stored back unmodified. |
| define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { |
| ; SI-LABEL: v_cndmask_abs_neg_f64: |
| ; SI: ; %bb.0: |
| ; SI-NEXT: s_load_dword s8, s[4:5], 0xb |
| ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd |
| ; SI-NEXT: s_mov_b32 s7, 0xf000 |
| ; SI-NEXT: s_mov_b32 s2, 0 |
| ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; SI-NEXT: v_mov_b32_e32 v1, 0 |
| ; SI-NEXT: s_mov_b32 s3, s7 |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 |
| ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 |
| ; SI-NEXT: s_mov_b32 s6, -1 |
| ; SI-NEXT: s_cmp_lg_u32 s8, 0 |
| ; SI-NEXT: s_waitcnt vmcnt(0) |
| ; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 |
| ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 |
| ; SI-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc |
| ; SI-NEXT: s_waitcnt lgkmcnt(0) |
| ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 |
| ; SI-NEXT: s_endpgm |
| ; |
| ; VI-LABEL: v_cndmask_abs_neg_f64: |
| ; VI: ; %bb.0: |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: v_mov_b32_e32 v1, s1 |
| ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 |
| ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
| ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] |
| ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; VI-NEXT: s_cmp_lg_u32 s2, 0 |
| ; VI-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; VI-NEXT: s_waitcnt vmcnt(0) |
| ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 |
| ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 |
| ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc |
| ; VI-NEXT: v_mov_b32_e32 v3, s1 |
| ; VI-NEXT: v_mov_b32_e32 v2, s0 |
| ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] |
| ; VI-NEXT: s_endpgm |
| ; |
| ; GFX10-LABEL: v_cndmask_abs_neg_f64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 |
| ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc |
| ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] |
| ; GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-LABEL: v_cndmask_abs_neg_f64: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX11-NEXT: v_mov_b32_e32 v3, 0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 |
| ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc |
| ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] |
| ; GFX11-NEXT: s_endpgm |
| ; |
| ; GFX12-LABEL: v_cndmask_abs_neg_f64: |
| ; GFX12: ; %bb.0: |
| ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 |
| ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX12-NEXT: v_mov_b32_e32 v3, 0 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[0:1] |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 |
| ; GFX12-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 |
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc |
| ; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1] |
| ; GFX12-NEXT: s_endpgm |
| %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx |
| %f = load double, ptr addrspace(1) %f.gep |
| %f.abs = call double @llvm.fabs.f64(double %f) |
| %f.neg = fneg double %f |
| %setcc = icmp ne i32 %c, 0 |
| %select = select i1 %setcc, double %f.abs, double %f.neg |
| store double %select, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Attribute groups referenced above: #0 is attached to the amdgpu_kernel |
| ; definitions; #1 is attached to the @llvm.amdgcn.workitem.id.x declaration |
| ; and to the calls to it. |
| attributes #0 = { nounwind } |
| attributes #1 = { nounwind readnone } |