| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s |
| |
| ; Test memcpy lowering where the length is given by a complex but constant expression. |
| ; A loop guard should not be necessary since the length is positive. |
| |
| ; Source and destination buffers for the memcpy in @_start: two zero-initialized |
| ; 128-byte arrays with 1-byte alignment. |
| @src_array = global [128 x i8] zeroinitializer, align 1 |
| @dst_array = global [128 x i8] zeroinitializer, align 1 |
| |
| ; Kernel that copies from @src_array to @dst_array with a single llvm.memcpy |
| ; call whose length operand is a constant expression rather than a literal. |
| define amdgpu_kernel void @_start() { |
| ; CHECK-LABEL: _start: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 |
| ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 |
| ; CHECK-NEXT: s_mov_b64 s[0:1], 0 |
| ; CHECK-NEXT: .LBB0_1: ; %dynamic-memcpy-expansion-main-body |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: s_getpc_b64 s[2:3] |
| ; CHECK-NEXT: s_add_u32 s2, s2, src_array@gotpcrel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s3, s3, src_array@gotpcrel32@hi+12 |
| ; CHECK-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_add_u32 s2, s2, s0 |
| ; CHECK-NEXT: s_addc_u32 s3, s3, s1 |
| ; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] |
| ; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[0:1] |
| ; CHECK-NEXT: s_getpc_b64 s[2:3] |
| ; CHECK-NEXT: s_add_u32 s2, s2, dst_array@gotpcrel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s3, s3, dst_array@gotpcrel32@hi+12 |
| ; CHECK-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_add_u32 s2, s2, s0 |
| ; CHECK-NEXT: s_addc_u32 s3, s3, s1 |
| ; CHECK-NEXT: s_add_u32 s0, s0, 16 |
| ; CHECK-NEXT: s_addc_u32 s1, s1, 0 |
| ; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] |
| ; CHECK-NEXT: v_cmp_lt_u64_e64 s[2:3], s[0:1], 16 |
| ; CHECK-NEXT: s_and_b64 vcc, exec, s[2:3] |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] |
| ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 |
| ; CHECK-NEXT: ; %bb.2: ; %dynamic-memcpy-expansion-residual-cond |
| ; FIXME: Compare should be evaluated at compile time |
| ; CHECK-NEXT: s_cmp_eq_u64 13, 0 |
| ; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 |
| ; CHECK-NEXT: ; %bb.3: ; %dynamic-memcpy-expansion-residual-body.preheader |
| ; CHECK-NEXT: s_sub_u32 s4, 29, 13 |
| ; CHECK-NEXT: s_subb_u32 s5, 0, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[0:1] |
| ; CHECK-NEXT: s_add_u32 s0, s0, src_array@gotpcrel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s1, s1, src_array@gotpcrel32@hi+12 |
| ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_add_u32 s2, s0, s4 |
| ; CHECK-NEXT: s_addc_u32 s3, s1, s5 |
| ; CHECK-NEXT: s_getpc_b64 s[0:1] |
| ; CHECK-NEXT: s_add_u32 s0, s0, dst_array@gotpcrel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s1, s1, dst_array@gotpcrel32@hi+12 |
| ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_add_u32 s4, s0, s4 |
| ; CHECK-NEXT: s_addc_u32 s5, s1, s5 |
| ; CHECK-NEXT: s_mov_b64 s[0:1], 0 |
| ; CHECK-NEXT: .LBB0_4: ; %dynamic-memcpy-expansion-residual-body |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: s_add_u32 s6, s2, s0 |
| ; CHECK-NEXT: s_addc_u32 s7, s3, s1 |
| ; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] |
| ; CHECK-NEXT: flat_load_ubyte v2, v[0:1] |
| ; CHECK-NEXT: s_add_u32 s6, s4, s0 |
| ; CHECK-NEXT: s_addc_u32 s7, s5, s1 |
| ; CHECK-NEXT: s_add_u32 s0, s0, 1 |
| ; CHECK-NEXT: s_addc_u32 s1, s1, 0 |
| ; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] |
| ; CHECK-NEXT: v_cmp_lt_u64_e64 s[6:7], s[0:1], 13 |
| ; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v2 |
| ; CHECK-NEXT: s_cbranch_vccnz .LBB0_4 |
| ; CHECK-NEXT: .LBB0_5: ; %dynamic-memcpy-post-expansion |
| ; CHECK-NEXT: s_endpgm |
| ; Both GEPs use all-zero indices, so each pointer addresses the first byte of |
| ; its global array. |
| %src_ptr = getelementptr inbounds [128 x i8], ptr @src_array, i64 0, i64 0 |
| %dst_ptr = getelementptr inbounds [128 x i8], ptr @dst_array, i64 0, i64 0 |
| ; The length operand is the constant expression |
| ;   (16 - ptrtoint(addrspacecast(ptr addrspace(4) null to ptr))) + 13. |
| ; The expected output above shows it lowered as a main loop that moves 16 bytes |
| ; per iteration (bounded by 16) followed by a byte-wise residual loop (bounded |
| ; by 13); the residual-cond compare is still emitted, as the FIXME notes. |
| call void @llvm.memcpy.p0.p0.i64(ptr %dst_ptr, ptr %src_ptr, i64 add (i64 sub (i64 16, i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 13), i1 false) |
| ret void |
| } |
| |
| ; Declaration of the memcpy intrinsic called from @_start. The overload suffix |
| ; (.p0.p0.i64) and the pointer parameter types match that call: both pointers |
| ; are in the default address space and the length is i64. |
| declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr noalias readonly captures(none), i64, i1 immarg) #0 |
| |
| ; Attribute group attached to the memcpy declaration above. |
| attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } |