blob: 287a8ab0e52f52e3eff4334088f81ad6c8329c14 [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=NEW_RBS %s
; if instruction is uniform and there is available instruction, select SALU instruction
; %a and %b are inreg arguments (s0 and s1 in the expected output). %a is
; converted with fptoui, added to %b, and the sum is stored through %ptr.
; The assertions for both check prefixes currently expect the conversion and
; the add as VALU instructions (v_cvt_u32_f32, v_add_nc_u32).
define amdgpu_ps void @uniform_in_vgpr(float inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: uniform_in_vgpr:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_cvt_u32_f32_e32 v2, s0
; OLD_RBS-NEXT: v_add_nc_u32_e32 v2, s1, v2
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: uniform_in_vgpr:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_cvt_u32_f32_e32 v2, s0
; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, s1, v2
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
%a.i32 = fptoui float %a to i32
%res = add i32 %a.i32, %b
store i32 %res, ptr addrspace(1) %ptr
ret void
}
; copy sgpr to vgpr + readfirstlane vgpr to sgpr combine from rb-legalize
; Two dependent operations on inreg inputs: the fadd result feeds fptoui, and
; the converted value is added to inreg %c. The expected output keeps the whole
; chain in v2 (v_add_f32, v_cvt_u32_f32, v_add_nc_u32) without copying the
; intermediate back to an SGPR.
define amdgpu_ps void @back_to_back_uniform_in_vgpr(float inreg %a, float inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: back_to_back_uniform_in_vgpr:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_add_f32_e64 v2, s0, s1
; OLD_RBS-NEXT: v_cvt_u32_f32_e32 v2, v2
; OLD_RBS-NEXT: v_add_nc_u32_e32 v2, s2, v2
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: back_to_back_uniform_in_vgpr:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_add_f32_e64 v2, s0, s1
; NEW_RBS-NEXT: v_cvt_u32_f32_e32 v2, v2
; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, s2, v2
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
%add = fadd float %a, %b
%add.i32 = fptoui float %add to i32
%res = add i32 %add.i32, %c
store i32 %res, ptr addrspace(1) %ptr
ret void
}
; fast rules for vector instructions
; Raw buffer load whose offset operand %voffset is an inreg argument (s4 in the
; expected output). The assertions expect s4 to be copied into v2 with
; v_mov_b32 before it is used as the offen offset of buffer_load_dwordx4.
; Element 1 of the loaded vector is incremented and stored through %ptr.
define amdgpu_cs void @buffer_load_uniform(<4 x i32> inreg %rsrc, i32 inreg %voffset, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: buffer_load_uniform:
; OLD_RBS: ; %bb.0: ; %.entry
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s4
; OLD_RBS-NEXT: buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen
; OLD_RBS-NEXT: s_waitcnt vmcnt(0)
; OLD_RBS-NEXT: v_add_nc_u32_e32 v2, 1, v3
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: buffer_load_uniform:
; NEW_RBS: ; %bb.0: ; %.entry
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s4
; NEW_RBS-NEXT: buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, 1, v3
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
.entry:
; The .v4i32 suffix matches the <4 x i32> result type and the intrinsic
; declaration in this file.
%vec = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%el1 = extractelement <4 x i32> %vec, i64 1
%res = add i32 %el1, 1
store i32 %res, ptr addrspace(1) %ptr
ret void
}
; Raw buffer load whose offset operand %voffset is a regular (non-inreg)
; argument. The assertions expect v0 to be used directly as the offen offset of
; buffer_load_dwordx4, with no extra copy. Element 1 of the loaded vector is
; incremented and stored through %ptr.
define amdgpu_cs void @buffer_load_divergent(<4 x i32> inreg %rsrc, i32 %voffset, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: buffer_load_divergent:
; OLD_RBS: ; %bb.0: ; %.entry
; OLD_RBS-NEXT: buffer_load_dwordx4 v[3:6], v0, s[0:3], 0 offen
; OLD_RBS-NEXT: s_waitcnt vmcnt(0)
; OLD_RBS-NEXT: v_add_nc_u32_e32 v0, 1, v4
; OLD_RBS-NEXT: global_store_dword v[1:2], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: buffer_load_divergent:
; NEW_RBS: ; %bb.0: ; %.entry
; NEW_RBS-NEXT: buffer_load_dwordx4 v[3:6], v0, s[0:3], 0 offen
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_add_nc_u32_e32 v0, 1, v4
; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off
; NEW_RBS-NEXT: s_endpgm
.entry:
; The .v4i32 suffix matches the <4 x i32> result type and the intrinsic
; declaration in this file.
%vec = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%el1 = extractelement <4 x i32> %vec, i64 1
%res = add i32 %el1, 1
store i32 %res, ptr addrspace(1) %ptr
ret void
}
; lowering in rb-legalize (sgpr S64 is legal, vgpr has to be split to S32)
; Both i64 operands are regular (non-inreg) arguments. The expected output
; performs the 64-bit and as two v_and_b32 instructions, one per 32-bit half
; (v0 & v2 and v1 & v3), and stores the pair with global_store_dwordx2.
define amdgpu_ps void @vgpr_and_i64(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: vgpr_and_i64:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_and_b32_e32 v0, v0, v2
; OLD_RBS-NEXT: v_and_b32_e32 v1, v1, v3
; OLD_RBS-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: vgpr_and_i64:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_and_b32_e32 v0, v0, v2
; NEW_RBS-NEXT: v_and_b32_e32 v1, v1, v3
; NEW_RBS-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; NEW_RBS-NEXT: s_endpgm
%res = and i64 %a, %b
store i64 %res, ptr addrspace(1) %ptr
ret void
}
; It is up to user instruction to deal with potential truncated bits in reg.
; Here G_ABS needs to sign extend S16 in reg to S32 and then do S32 G_ABS.
; %arg is an inreg i16. The expected output sign-extends s0 with
; s_sext_i32_i16, applies s_abs_i32, copies the result to v2 and stores it
; with global_store_short.
define amdgpu_ps void @abs_sgpr_i16(i16 inreg %arg, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: abs_sgpr_i16:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: s_sext_i32_i16 s0, s0
; OLD_RBS-NEXT: s_abs_i32 s0, s0
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT: global_store_short v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: abs_sgpr_i16:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: s_sext_i32_i16 s0, s0
; NEW_RBS-NEXT: s_abs_i32 s0, s0
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_short v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
store i16 %res, ptr addrspace(1) %ptr
ret void
}
; i1 phi whose incoming values are compares of inreg %tid; the branch that
; picks between them tests inreg %cond. The expected output keeps the phi
; value in s2 as 0 or 1 (s_cselect_b32 after each s_cmp) and implements the
; select of 1 or 2 as s_bfe_i32 on s2 followed by s_add_i32 with 2.
define amdgpu_ps void @uniform_i1_phi(ptr addrspace(1) %out, i32 inreg %tid, i32 inreg %cond) {
; OLD_RBS-LABEL: uniform_i1_phi:
; OLD_RBS: ; %bb.0: ; %A
; OLD_RBS-NEXT: s_cmp_ge_u32 s0, 6
; OLD_RBS-NEXT: s_cselect_b32 s2, 1, 0
; OLD_RBS-NEXT: s_cmp_lg_u32 s1, 0
; OLD_RBS-NEXT: s_cbranch_scc1 .LBB6_2
; OLD_RBS-NEXT: ; %bb.1: ; %B
; OLD_RBS-NEXT: s_cmp_lt_u32 s0, 1
; OLD_RBS-NEXT: s_cselect_b32 s2, 1, 0
; OLD_RBS-NEXT: .LBB6_2: ; %exit
; OLD_RBS-NEXT: s_bfe_i32 s0, s2, 0x10000
; OLD_RBS-NEXT: s_add_i32 s0, s0, 2
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: uniform_i1_phi:
; NEW_RBS: ; %bb.0: ; %A
; NEW_RBS-NEXT: s_cmp_ge_u32 s0, 6
; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0
; NEW_RBS-NEXT: s_cmp_lg_u32 s1, 0
; NEW_RBS-NEXT: s_cbranch_scc1 .LBB6_2
; NEW_RBS-NEXT: ; %bb.1: ; %B
; NEW_RBS-NEXT: s_cmp_lt_u32 s0, 1
; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0
; NEW_RBS-NEXT: .LBB6_2: ; %exit
; NEW_RBS-NEXT: s_bfe_i32 s0, s2, 0x10000
; NEW_RBS-NEXT: s_add_i32 s0, s0, 2
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
A:
%val_A = icmp uge i32 %tid, 6
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %B, label %exit
B:
%val_B = icmp ult i32 %tid, 1
br label %exit
exit:
; %phi is %val_A when arriving straight from A, %val_B when arriving via B.
%phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
%sel = select i1 %phi, i32 1, i32 2
store i32 %sel, ptr addrspace(1) %out
ret void
}
; this is kind of i1 readfirstlane
; uniform i1 result on instruction that is only available on VALU
; The fcmp operates on inreg %a and its result selects between inreg %b and
; inreg %c. The expected output compares with v_cmp_eq_f32 (result written to
; s0) and selects with v_cndmask_b32, after copying s2 into v2.
define amdgpu_ps void @vcc_to_scc(float inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: vcc_to_scc:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s2
; OLD_RBS-NEXT: v_cmp_eq_f32_e64 s0, s0, 0
; OLD_RBS-NEXT: v_cndmask_b32_e64 v2, v2, s1, s0
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: vcc_to_scc:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s2
; NEW_RBS-NEXT: v_cmp_eq_f32_e64 s0, s0, 0
; NEW_RBS-NEXT: v_cndmask_b32_e64 v2, v2, s1, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
%vcc_to_scc = fcmp oeq float %a, 0.0
%select = select i1 %vcc_to_scc, i32 %b, i32 %c
store i32 %select, ptr addrspace(1) %ptr
ret void
}
; combiner in rb-legalize recognizes sgpr S1 to vcc copy
; The icmp operates on inreg %a, but the select picks between regular
; (non-inreg) arguments %b and %c. The expected output materializes the
; compare as 0 or 1 in s0 (s_cmp_eq_u32 + s_cselect_b32), masks it with 1, and
; converts it to vcc_lo with v_cmp_ne_u32 for the v_cndmask_b32.
define amdgpu_ps void @scc_to_vcc(i32 inreg %a, i32 %b, i32 %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: scc_to_vcc:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: s_cmp_eq_u32 s0, 0
; OLD_RBS-NEXT: s_cselect_b32 s0, 1, 0
; OLD_RBS-NEXT: s_and_b32 s0, 1, s0
; OLD_RBS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; OLD_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; OLD_RBS-NEXT: global_store_dword v[2:3], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: scc_to_vcc:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 0
; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0
; NEW_RBS-NEXT: s_and_b32 s0, 1, s0
; NEW_RBS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; NEW_RBS-NEXT: global_store_dword v[2:3], v0, off
; NEW_RBS-NEXT: s_endpgm
%scc_to_vcc = icmp eq i32 %a, 0
%select = select i1 %scc_to_vcc, i32 %b, i32 %c
store i32 %select, ptr addrspace(1) %ptr
ret void
}
; this is only G_TRUNC that is not no-op in global-isel for AMDGPU
; The i1 condition comes from truncating regular (non-inreg) argument %a. The
; expected output isolates bit 0 with v_and_b32, turns it into vcc_lo with
; v_cmp_ne_u32 against 0, and selects with v_cndmask_b32.
define amdgpu_ps void @vgpr_to_vcc_trunc(i32 %a, i32 %b, i32 %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: vgpr_to_vcc_trunc:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_and_b32_e32 v0, 1, v0
; OLD_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; OLD_RBS-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; OLD_RBS-NEXT: global_store_dword v[3:4], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: vgpr_to_vcc_trunc:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_and_b32_e32 v0, 1, v0
; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; NEW_RBS-NEXT: global_store_dword v[3:4], v0, off
; NEW_RBS-NEXT: s_endpgm
%vcc = trunc i32 %a to i1
%select = select i1 %vcc, i32 %b, i32 %c
store i32 %select, ptr addrspace(1) %ptr
ret void
}
; i1 input to zext and sext is something that survived legalizer (not trunc)
; lower to select
; The i1 comes from comparing inreg %a with 10. The expected output produces
; the zero-extended value directly as s_cselect_b32 of 1 or 0 after
; s_cmp_eq_u32, then copies it to v2 for the store.
define amdgpu_ps void @zext(i32 inreg %a, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: zext:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: s_cmp_eq_u32 s0, 10
; OLD_RBS-NEXT: s_cselect_b32 s0, 1, 0
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: zext:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 10
; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
%bool = icmp eq i32 %a, 10
%zext = zext i1 %bool to i32
store i32 %zext, ptr addrspace(1) %ptr
ret void
}
; Sign-extending counterpart of the zext case: the i1 comes from comparing
; inreg %a with 10. The expected output first produces 1 or 0 with
; s_cmp_eq_u32 + s_cselect_b32, then applies s_bfe_i32 to that value before
; copying it to v2 for the store.
define amdgpu_ps void @sext(i32 inreg %a, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: sext:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: s_cmp_eq_u32 s0, 10
; OLD_RBS-NEXT: s_cselect_b32 s0, 1, 0
; OLD_RBS-NEXT: s_bfe_i32 s0, s0, 0x10000
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: sext:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 10
; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0
; NEW_RBS-NEXT: s_bfe_i32 s0, s0, 0x10000
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
%bool = icmp eq i32 %a, 10
%sext = sext i1 %bool to i32
store i32 %sext, ptr addrspace(1) %ptr
ret void
}
; divergent i1 bitwise, i1 vcc.
; inst selected into s_and_b32 on wave32 or s_and_b64 on wave64.
; Both compares operate on regular (non-inreg) arguments. The expected output
; writes the two compare results to vcc_lo and s0, combines them with
; s_and_b32 into vcc_lo, and selects with v_cndmask_b32.
define amdgpu_ps void @and_i1_vcc(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: and_i1_vcc:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_cmp_le_u32_e32 vcc_lo, 10, v0
; OLD_RBS-NEXT: v_cmp_le_u32_e64 s0, 20, v1
; OLD_RBS-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; OLD_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; OLD_RBS-NEXT: global_store_dword v[2:3], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: and_i1_vcc:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_cmp_le_u32_e32 vcc_lo, 10, v0
; NEW_RBS-NEXT: v_cmp_le_u32_e64 s0, 20, v1
; NEW_RBS-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; NEW_RBS-NEXT: global_store_dword v[2:3], v0, off
; NEW_RBS-NEXT: s_endpgm
%cmp_a = icmp uge i32 %a, 10
%cmp_b = icmp uge i32 %b, 20
%cc = and i1 %cmp_a, %cmp_b
%res = select i1 %cc, i32 %a, i32 %b
store i32 %res, ptr addrspace(1) %ptr
ret void
}
; uniform i1 bitwise, i32 sgpr. inst selected into s_and_b32.
; Same IR body as the and_i1_vcc case, but %a and %b are inreg. The expected
; output materializes each compare as 0 or 1 in s2 and s3 (s_cmp_ge_u32 +
; s_cselect_b32), ands them with s_and_b32, masks the result with 1, and
; selects between s0 and s1 with s_cmp_lg_u32 + s_cselect_b32.
define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: and_i1_scc:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: s_cmp_ge_u32 s0, 10
; OLD_RBS-NEXT: s_cselect_b32 s2, 1, 0
; OLD_RBS-NEXT: s_cmp_ge_u32 s1, 20
; OLD_RBS-NEXT: s_cselect_b32 s3, 1, 0
; OLD_RBS-NEXT: s_and_b32 s2, s2, s3
; OLD_RBS-NEXT: s_and_b32 s2, s2, 1
; OLD_RBS-NEXT: s_cmp_lg_u32 s2, 0
; OLD_RBS-NEXT: s_cselect_b32 s0, s0, s1
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: and_i1_scc:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: s_cmp_ge_u32 s0, 10
; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0
; NEW_RBS-NEXT: s_cmp_ge_u32 s1, 20
; NEW_RBS-NEXT: s_cselect_b32 s3, 1, 0
; NEW_RBS-NEXT: s_and_b32 s2, s2, s3
; NEW_RBS-NEXT: s_and_b32 s2, s2, 1
; NEW_RBS-NEXT: s_cmp_lg_u32 s2, 0
; NEW_RBS-NEXT: s_cselect_b32 s0, s0, s1
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
%cmp_a = icmp uge i32 %a, 10
%cmp_b = icmp uge i32 %b, 20
%cc = and i1 %cmp_a, %cmp_b
%res = select i1 %cc, i32 %a, i32 %b
store i32 %res, ptr addrspace(1) %ptr
ret void
}
; old RBS selects sgpr phi because it had sgpr inputs.
; The branch condition compares regular (non-inreg) argument %a with 0, while
; both phi inputs are the constants 0 and 1. The expected output holds the phi
; value in s0 (s_mov_b32 of 0, overwritten with 1 in block B under
; s_and_saveexec_b32) and copies it to v0 for the store.
define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1) %out) {
; OLD_RBS-LABEL: divergent_phi_with_uniform_inputs:
; OLD_RBS: ; %bb.0: ; %A
; OLD_RBS-NEXT: s_mov_b32 s0, 0
; OLD_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; OLD_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo
; OLD_RBS-NEXT: ; %bb.1: ; %B
; OLD_RBS-NEXT: s_mov_b32 s0, 1
; OLD_RBS-NEXT: ; %bb.2: ; %exit
; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1
; OLD_RBS-NEXT: v_mov_b32_e32 v0, s0
; OLD_RBS-NEXT: global_store_dword v[1:2], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: divergent_phi_with_uniform_inputs:
; NEW_RBS: ; %bb.0: ; %A
; NEW_RBS-NEXT: s_mov_b32 s0, 0
; NEW_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; NEW_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo
; NEW_RBS-NEXT: ; %bb.1: ; %B
; NEW_RBS-NEXT: s_mov_b32 s0, 1
; NEW_RBS-NEXT: ; %bb.2: ; %exit
; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1
; NEW_RBS-NEXT: v_mov_b32_e32 v0, s0
; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off
; NEW_RBS-NEXT: s_endpgm
A:
%cmp = icmp eq i32 %a, 0
br i1 %cmp, label %B, label %exit
B:
br label %exit
exit:
; 0 when arriving straight from A, 1 when arriving via B.
%phi = phi i32 [ 0, %A ], [ 1, %B ]
store i32 %phi, ptr addrspace(1) %out
ret void
}
; old RBS assigned vgpr to uniform phi (because one input had undetermined bank)
; and it propagated to mul, which was not wrong.
; new RBS assigns vgpr to destination of mul even though both inputs are sgpr.
; TODO: implement temporal divergence lowering
; The loop increments %counter until its float conversion compares greater
; than the regular (non-inreg) argument %val; %counter is then used after the
; loop, in the exit block, as the operand of a multiply by 10. The expected
; output keeps the counter in v3 and emits the multiply as v_mul_lo_u32.
define amdgpu_ps void @divergent_because_of_temporal_divergent_use(float %val, ptr addrspace(1) %addr) {
; OLD_RBS-LABEL: divergent_because_of_temporal_divergent_use:
; OLD_RBS: ; %bb.0: ; %entry
; OLD_RBS-NEXT: s_mov_b32 s0, -1
; OLD_RBS-NEXT: v_mov_b32_e32 v3, s0
; OLD_RBS-NEXT: s_mov_b32 s0, 0
; OLD_RBS-NEXT: .LBB15_1: ; %loop
; OLD_RBS-NEXT: ; =>This Inner Loop Header: Depth=1
; OLD_RBS-NEXT: v_add_nc_u32_e32 v3, 1, v3
; OLD_RBS-NEXT: v_cvt_f32_u32_e32 v4, v3
; OLD_RBS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0
; OLD_RBS-NEXT: s_or_b32 s0, vcc_lo, s0
; OLD_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; OLD_RBS-NEXT: s_cbranch_execnz .LBB15_1
; OLD_RBS-NEXT: ; %bb.2: ; %exit
; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s0
; OLD_RBS-NEXT: v_mul_lo_u32 v0, v3, 10
; OLD_RBS-NEXT: global_store_dword v[1:2], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: divergent_because_of_temporal_divergent_use:
; NEW_RBS: ; %bb.0: ; %entry
; NEW_RBS-NEXT: s_mov_b32 s0, -1
; NEW_RBS-NEXT: v_mov_b32_e32 v3, s0
; NEW_RBS-NEXT: s_mov_b32 s0, 0
; NEW_RBS-NEXT: .LBB15_1: ; %loop
; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1
; NEW_RBS-NEXT: v_add_nc_u32_e32 v3, 1, v3
; NEW_RBS-NEXT: v_cvt_f32_u32_e32 v4, v3
; NEW_RBS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0
; NEW_RBS-NEXT: s_or_b32 s0, vcc_lo, s0
; NEW_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; NEW_RBS-NEXT: s_cbranch_execnz .LBB15_1
; NEW_RBS-NEXT: ; %bb.2: ; %exit
; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s0
; NEW_RBS-NEXT: v_mul_lo_u32 v0, v3, 10
; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off
; NEW_RBS-NEXT: s_endpgm
entry:
br label %loop
loop:
%counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
%f.counter = uitofp i32 %counter to float
%cond = fcmp ogt float %f.counter, %val
%counter.plus.1 = add i32 %counter, 1
br i1 %cond, label %exit, label %loop
exit:
; Use of the loop-carried %counter outside the loop.
%ceilx10 = mul i32 %counter, 10
store i32 %ceilx10, ptr addrspace(1) %addr
ret void
}
; Variables that handle counter can be allocated to sgprs.
; Machine uniformity analysis claims some of those registers are divergent while
; LLVM-IR uniformity analysis claims corresponding values are uniform.
; TODO: fix this in Machine uniformity analysis.
; Loop with two early exits: block A leaves when a[counter] is 0 and block B
; leaves when b[counter] is 0. Otherwise loop.body increments x[counter],
; advances the counter, and branches to exit when the old counter is below 100
; (unsigned), else back to A. The expected output keeps the counter in v6.
define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; OLD_RBS-LABEL: loop_with_2breaks:
; OLD_RBS: ; %bb.0: ; %entry
; OLD_RBS-NEXT: s_mov_b32 s0, 0
; OLD_RBS-NEXT: ; implicit-def: $sgpr1
; OLD_RBS-NEXT: v_mov_b32_e32 v6, s0
; OLD_RBS-NEXT: s_branch .LBB16_3
; OLD_RBS-NEXT: .LBB16_1: ; %Flow3
; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT: s_waitcnt_depctr 0xffe3
; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s3
; OLD_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo
; OLD_RBS-NEXT: s_and_b32 s3, exec_lo, s4
; OLD_RBS-NEXT: s_or_b32 s1, s1, s3
; OLD_RBS-NEXT: .LBB16_2: ; %Flow
; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s2
; OLD_RBS-NEXT: s_and_b32 s2, exec_lo, s1
; OLD_RBS-NEXT: s_or_b32 s0, s2, s0
; OLD_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; OLD_RBS-NEXT: s_cbranch_execz .LBB16_6
; OLD_RBS-NEXT: .LBB16_3: ; %A
; OLD_RBS-NEXT: ; =>This Inner Loop Header: Depth=1
; OLD_RBS-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; OLD_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo
; OLD_RBS-NEXT: s_and_b32 s2, exec_lo, -1
; OLD_RBS-NEXT: s_or_b32 s1, s1, s2
; OLD_RBS-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
; OLD_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
; OLD_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
; OLD_RBS-NEXT: global_load_dword v9, v[9:10], off
; OLD_RBS-NEXT: s_waitcnt vmcnt(0)
; OLD_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
; OLD_RBS-NEXT: s_and_saveexec_b32 s2, vcc_lo
; OLD_RBS-NEXT: s_cbranch_execz .LBB16_2
; OLD_RBS-NEXT: ; %bb.4: ; %B
; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7
; OLD_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
; OLD_RBS-NEXT: s_mov_b32 s4, -1
; OLD_RBS-NEXT: global_load_dword v9, v[9:10], off
; OLD_RBS-NEXT: s_waitcnt vmcnt(0)
; OLD_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
; OLD_RBS-NEXT: s_and_saveexec_b32 s3, vcc_lo
; OLD_RBS-NEXT: s_cbranch_execz .LBB16_1
; OLD_RBS-NEXT: ; %bb.5: ; %loop.body
; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
; OLD_RBS-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
; OLD_RBS-NEXT: v_add_nc_u32_e32 v10, 1, v6
; OLD_RBS-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
; OLD_RBS-NEXT: s_andn2_b32 s4, -1, exec_lo
; OLD_RBS-NEXT: global_load_dword v9, v[7:8], off
; OLD_RBS-NEXT: v_mov_b32_e32 v6, v10
; OLD_RBS-NEXT: s_and_b32 s5, exec_lo, vcc_lo
; OLD_RBS-NEXT: s_or_b32 s4, s4, s5
; OLD_RBS-NEXT: s_waitcnt vmcnt(0)
; OLD_RBS-NEXT: v_add_nc_u32_e32 v9, 1, v9
; OLD_RBS-NEXT: global_store_dword v[7:8], v9, off
; OLD_RBS-NEXT: s_branch .LBB16_1
; OLD_RBS-NEXT: .LBB16_6: ; %exit
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: loop_with_2breaks:
; NEW_RBS: ; %bb.0: ; %entry
; NEW_RBS-NEXT: s_mov_b32 s0, 0
; NEW_RBS-NEXT: ; implicit-def: $sgpr1
; NEW_RBS-NEXT: v_mov_b32_e32 v6, s0
; NEW_RBS-NEXT: s_branch .LBB16_3
; NEW_RBS-NEXT: .LBB16_1: ; %Flow3
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT: s_waitcnt_depctr 0xffe3
; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s3
; NEW_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo
; NEW_RBS-NEXT: s_and_b32 s3, exec_lo, s4
; NEW_RBS-NEXT: s_or_b32 s1, s1, s3
; NEW_RBS-NEXT: .LBB16_2: ; %Flow
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s2
; NEW_RBS-NEXT: s_and_b32 s2, exec_lo, s1
; NEW_RBS-NEXT: s_or_b32 s0, s2, s0
; NEW_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; NEW_RBS-NEXT: s_cbranch_execz .LBB16_6
; NEW_RBS-NEXT: .LBB16_3: ; %A
; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1
; NEW_RBS-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; NEW_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo
; NEW_RBS-NEXT: s_and_b32 s2, exec_lo, -1
; NEW_RBS-NEXT: s_or_b32 s1, s1, s2
; NEW_RBS-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
; NEW_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
; NEW_RBS-NEXT: global_load_dword v9, v[9:10], off
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
; NEW_RBS-NEXT: s_and_saveexec_b32 s2, vcc_lo
; NEW_RBS-NEXT: s_cbranch_execz .LBB16_2
; NEW_RBS-NEXT: ; %bb.4: ; %B
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7
; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
; NEW_RBS-NEXT: s_mov_b32 s4, -1
; NEW_RBS-NEXT: global_load_dword v9, v[9:10], off
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
; NEW_RBS-NEXT: s_and_saveexec_b32 s3, vcc_lo
; NEW_RBS-NEXT: s_cbranch_execz .LBB16_1
; NEW_RBS-NEXT: ; %bb.5: ; %loop.body
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
; NEW_RBS-NEXT: v_add_nc_u32_e32 v10, 1, v6
; NEW_RBS-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
; NEW_RBS-NEXT: s_andn2_b32 s4, -1, exec_lo
; NEW_RBS-NEXT: global_load_dword v9, v[7:8], off
; NEW_RBS-NEXT: v_mov_b32_e32 v6, v10
; NEW_RBS-NEXT: s_and_b32 s5, exec_lo, vcc_lo
; NEW_RBS-NEXT: s_or_b32 s4, s4, s5
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_add_nc_u32_e32 v9, 1, v9
; NEW_RBS-NEXT: global_store_dword v[7:8], v9, off
; NEW_RBS-NEXT: s_branch .LBB16_1
; NEW_RBS-NEXT: .LBB16_6: ; %exit
; NEW_RBS-NEXT: s_endpgm
entry:
br label %A
A:
; First break: leave the loop when a[counter] == 0.
%counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
%a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
%a.val = load i32, ptr addrspace(1) %a.plus.counter
%a.cond = icmp eq i32 %a.val, 0
br i1 %a.cond, label %exit, label %B
B:
; Second break: leave the loop when b[counter] == 0.
%b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
%b.val = load i32, ptr addrspace(1) %b.plus.counter
%b.cond = icmp eq i32 %b.val, 0
br i1 %b.cond, label %exit, label %loop.body
loop.body:
; x[counter] += 1, then advance the counter.
%x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
%x.val = load i32, ptr addrspace(1) %x.plus.counter
%x.val.plus.1 = add i32 %x.val, 1
store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
%counter.plus.1 = add i32 %counter, 1
; NOTE(review): the true edge (counter < 100) goes to %exit, so the loop only
; continues once counter >= 100 - confirm this is the intended condition.
%x.cond = icmp ult i32 %counter, 100
br i1 %x.cond, label %exit, label %A
exit:
ret void
}
declare i16 @llvm.abs.i16(i16, i1)
declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)