| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s |
| |
| ; Check for correct folding of the constants produced by the |
| ; stepvector into the fadd. The value should not get lost when folding |
| ; through subregister extracts of reg_sequence. |
| ; Kernel under test. The entry block converts the result of |
| ; llvm.stepvector.v8i32 to <8 x half> once; the loop then adds that vector to |
| ; every <8 x half> chunk it loads and stores the sum. |
| ; The GFX942 lines inside the function are autogenerated assertions (see the |
| ; NOTE on the first line of the file); regenerate them with the script rather |
| ; than editing them by hand. |
| define amdgpu_kernel void @stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38(ptr addrspace(1) readonly captures(none) %.global, ptr addrspace(1) writeonly captures(none) %.global1, i32 %arg2) { |
| ; GFX942-LABEL: stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b97a102776ef47f0e8e894a38: |
| ; GFX942: ; %bb.0: ; %bb |
| ; GFX942-NEXT: s_load_dword s6, s[4:5], 0x10 |
| ; GFX942-NEXT: s_mov_b32 s7, 0 |
| ; GFX942-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX942-NEXT: s_cmp_eq_u32 s6, 0 |
| ; GFX942-NEXT: s_cbranch_scc1 .LBB0_3 |
| ; GFX942-NEXT: ; %bb.1: ; %.lr.ph.preheader |
| ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; GFX942-NEXT: s_mov_b32 s8, 0x47004600 |
| ; GFX942-NEXT: s_mov_b32 s9, 0x45004400 |
| ; GFX942-NEXT: s_mov_b32 s10, 0x42004000 |
| ; GFX942-NEXT: s_mov_b64 s[4:5], 0 |
| ; GFX942-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] |
| ; GFX942-NEXT: .LBB0_2: ; %.lr.ph |
| ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX942-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX942-NEXT: global_load_dwordx4 v[4:7], v2, s[2:3] |
| ; GFX942-NEXT: s_add_u32 s4, s4, 8 |
| ; GFX942-NEXT: s_addc_u32 s5, s5, 0 |
| ; GFX942-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] |
| ; GFX942-NEXT: s_waitcnt vmcnt(0) |
| ; GFX942-NEXT: v_pk_add_f16 v7, v7, s8 |
| ; GFX942-NEXT: v_pk_add_f16 v6, v6, s9 |
| ; GFX942-NEXT: v_pk_add_f16 v5, v5, s10 |
| ; GFX942-NEXT: v_pk_add_f16 v4, v4, 1.0 op_sel:[0,1] op_sel_hi:[1,0] |
| ; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1] |
| ; GFX942-NEXT: s_add_u32 s0, s0, 16 |
| ; GFX942-NEXT: s_addc_u32 s1, s1, 0 |
| ; GFX942-NEXT: s_add_u32 s2, s2, 16 |
| ; GFX942-NEXT: s_addc_u32 s3, s3, 0 |
| ; GFX942-NEXT: s_cbranch_vccnz .LBB0_2 |
| ; GFX942-NEXT: .LBB0_3: ; %._crit_edge |
| ; GFX942-NEXT: s_endpgm |
| bb: |
| ; Loop-invariant addend: the step vector converted from i32 to half. |
| ; NOTE(review): presumably <0.0, 1.0, ..., 7.0>; the s_mov_b32 immediates in |
| ; the assertions (0x42004000, 0x45004400, 0x47004600) look like the packed f16 |
| ; pairs (2,3), (4,5), (6,7) - confirm against the stepvector definition. |
| %i = tail call <8 x i32> @llvm.stepvector.v8i32() |
| %i3 = sitofp <8 x i32> %i to <8 x half> |
| ; Loop bound is %arg2 widened to i64; the loop is skipped when %arg2 is zero. |
| %i4 = zext i32 %arg2 to i64 |
| %.not = icmp eq i32 %arg2, 0 |
| br i1 %.not, label %._crit_edge, label %.lr.ph |

| .lr.ph: ; preds = %.lr.ph, %bb |
| ; %i5 is the half-element index: 0 on entry, advanced by 8 each iteration. |
| %i5 = phi i64 [ %i6, %.lr.ph ], [ 0, %bb ] |
| %i6 = add nuw nsw i64 %i5, 8 |
| ; Load 8 halves at index %i5 from %.global1, add the step vector, and store |
| ; the result at the same index in %.global. |
| ; NOTE(review): %.global1 is declared writeonly yet is the load source, and |
| ; %.global is declared readonly yet is the store destination - confirm this |
| ; is intentional for the reduced test case. |
| %i7 = getelementptr inbounds nuw half, ptr addrspace(1) %.global1, i64 %i5 |
| %i8 = load <8 x half>, ptr addrspace(1) %i7, align 2 |
| %i9 = fadd <8 x half> %i8, %i3 |
| %i10 = getelementptr inbounds nuw half, ptr addrspace(1) %.global, i64 %i5 |
| store <8 x half> %i9, ptr addrspace(1) %i10, align 2 |
| ; Keep looping while the next index is unsigned-less-than the bound. |
| %i11 = icmp samesign ult i64 %i6, %i4 |
| br i1 %i11, label %.lr.ph, label %._crit_edge |

| ._crit_edge: ; preds = %.lr.ph, %bb |
| ret void |
| } |
| |
| ; Intrinsic called once in the kernel's entry block; its <8 x i32> result is |
| ; converted to half and used as the fadd operand. |
| declare <8 x i32> @llvm.stepvector.v8i32() #0 |

| ; Attribute group #0, referenced by the intrinsic declaration. |
| attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } |