; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s

; Check code generation for memmoves with a statically unknown size, covering
; all combinations of the following address spaces:
; destination address space: 0, 1, 3, 5
; source address space: 0, 1, 3, 4, 5
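;
; The lowering below follows one recognizable pattern (a rough sketch
; inferred from the CHECK lines, not an authoritative description of the
; pass): the size is split with sz & -16 for a 16-byte-wide main loop and
; sz & 15 for a byte-wise residual loop, and, where the address spaces may
; alias, the copy direction is chosen by comparing the pointers:
;
;   if (src >= dst) {  ; forward copy is safe
;     memmove_fwd_main_loop, then memmove_fwd_residual_loop
;   } else {           ; regions may overlap: copy backwards
;     memmove_bwd_residual_loop, then memmove_bwd_main_loop
;   }
;
; Where the address spaces cannot alias (e.g. global <- LDS, global <-
; private), a forward-only memcpy-style expansion (loop-memcpy-expansion)
; is emitted instead. Memory instructions are selected per address space:
; flat_* for p0, global_* for p1 and p4, ds_* for p3, buffer_* for p5.
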
define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p0_p0:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v8, 15, v4
; CHECK-NEXT: v_mov_b32_e32 v9, 0
; CHECK-NEXT: v_and_b32_e32 v6, -16, v4
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB0_3
; CHECK-NEXT: ; %bb.1: ; %Flow35
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execnz .LBB0_10
; CHECK-NEXT: .LBB0_2: ; %Flow36
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB0_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB0_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v10, v0
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[4:5]
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[14:17]
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
; CHECK-NEXT: .LBB0_6: ; %Flow30
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v4, v[2:3]
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB0_8
; CHECK-NEXT: .LBB0_9: ; %Flow28
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: .LBB0_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v12, v[10:11]
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[4:5], v12
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB0_12
; CHECK-NEXT: .LBB0_13: ; %Flow34
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB0_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB0_15
; CHECK-NEXT: .LBB0_16: ; %Flow32
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}

define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p0_p1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v8, 15, v4
; CHECK-NEXT: v_mov_b32_e32 v9, 0
; CHECK-NEXT: v_and_b32_e32 v6, -16, v4
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
; CHECK-NEXT: ; %bb.1: ; %Flow37
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execnz .LBB1_10
; CHECK-NEXT: .LBB1_2: ; %Flow38
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB1_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB1_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v10, v0
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[14:17]
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB1_5
; CHECK-NEXT: .LBB1_6: ; %Flow32
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB1_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB1_8
; CHECK-NEXT: .LBB1_9: ; %Flow30
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execz .LBB1_2
; CHECK-NEXT: .LBB1_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB1_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v2, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v3, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v1, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v12, v[4:5], off
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[10:11], v12
; CHECK-NEXT: v_add_co_u32 v10, s5, v10, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, -1, v11, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB1_12
; CHECK-NEXT: .LBB1_13: ; %Flow36
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB1_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB1_15
; CHECK-NEXT: .LBB1_16: ; %Flow34
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}

define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p0_p3:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v7, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v8, 0
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT: v_and_b32_e32 v5, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8]
; CHECK-NEXT: v_cndmask_b32_e32 v9, -1, v0, vcc_lo
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6]
; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v9
; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB2_3
; CHECK-NEXT: ; %bb.1: ; %Flow39
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execnz .LBB2_10
; CHECK-NEXT: .LBB2_2: ; %Flow40
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB2_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB2_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v10, v1
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: v_mov_b32_e32 v9, v0
; CHECK-NEXT: v_mov_b32_e32 v11, v5
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_b128 v[13:16], v4
; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16]
; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB2_5
; CHECK-NEXT: .LBB2_6: ; %Flow34
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB2_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v3, v2
; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB2_8
; CHECK-NEXT: .LBB2_9: ; %Flow32
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execz .LBB2_2
; CHECK-NEXT: .LBB2_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB2_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v1, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v11, v4
; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4
; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[9:10], v11
; CHECK-NEXT: v_add_co_u32 v9, s5, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, -1, v10, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB2_12
; CHECK-NEXT: .LBB2_13: ; %Flow38
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB2_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo
; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5
; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4]
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: s_or_b32 s7, s4, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB2_15
; CHECK-NEXT: .LBB2_16: ; %Flow36
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}

define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p0_p4:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v8, 15, v4
; CHECK-NEXT: v_mov_b32_e32 v9, 0
; CHECK-NEXT: v_and_b32_e32 v6, -16, v4
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB3_3
; CHECK-NEXT: ; %bb.1: ; %Flow34
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execnz .LBB3_10
; CHECK-NEXT: .LBB3_2: ; %Flow35
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB3_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB3_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v10, v0
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB3_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[14:17]
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB3_5
; CHECK-NEXT: .LBB3_6: ; %Flow29
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB3_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB3_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB3_8
; CHECK-NEXT: .LBB3_9: ; %Flow27
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execz .LBB3_2
; CHECK-NEXT: .LBB3_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB3_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB3_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v12, v[10:11], off
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[4:5], v12
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB3_12
; CHECK-NEXT: .LBB3_13: ; %Flow33
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB3_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB3_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB3_15
; CHECK-NEXT: .LBB3_16: ; %Flow31
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}

define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p0_p5:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v7, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v8, 0
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT: v_and_b32_e32 v5, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8]
; CHECK-NEXT: v_cndmask_b32_e32 v9, -1, v0, vcc_lo
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6]
; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v9
; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB4_3
; CHECK-NEXT: ; %bb.1: ; %Flow39
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execnz .LBB4_10
; CHECK-NEXT: .LBB4_2: ; %Flow40
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB4_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB4_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v10, v1
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: v_mov_b32_e32 v9, v0
; CHECK-NEXT: v_mov_b32_e32 v11, v5
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB4_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: buffer_load_dword v13, v4, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v14, v4, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v15, v4, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v16, v4, s[0:3], 0 offen offset:12
; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16]
; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB4_5
; CHECK-NEXT: .LBB4_6: ; %Flow34
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB4_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB4_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB4_8
; CHECK-NEXT: .LBB4_9: ; %Flow32
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execz .LBB4_2
; CHECK-NEXT: .LBB4_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB4_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v1, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB4_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4
; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[9:10], v11
; CHECK-NEXT: v_add_co_u32 v9, s5, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, -1, v10, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB4_12
; CHECK-NEXT: .LBB4_13: ; %Flow38
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB4_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB4_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo
; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5
; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4]
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: s_or_b32 s7, s4, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB4_15
; CHECK-NEXT: .LBB4_16: ; %Flow36
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}

define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p1_p0:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v8, 15, v4
; CHECK-NEXT: v_mov_b32_e32 v9, 0
; CHECK-NEXT: v_and_b32_e32 v6, -16, v4
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB5_3
; CHECK-NEXT: ; %bb.1: ; %Flow37
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execnz .LBB5_10
; CHECK-NEXT: .LBB5_2: ; %Flow38
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB5_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB5_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v10, v0
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB5_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[4:5]
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB5_5
; CHECK-NEXT: .LBB5_6: ; %Flow32
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB5_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB5_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v4, v[2:3]
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_byte v[0:1], v4, off
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB5_8
; CHECK-NEXT: .LBB5_9: ; %Flow30
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execz .LBB5_2
; CHECK-NEXT: .LBB5_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB5_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB5_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v12, v[10:11]
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_byte v[4:5], v12, off
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB5_12
; CHECK-NEXT: .LBB5_13: ; %Flow36
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB5_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB5_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB5_15
; CHECK-NEXT: .LBB5_16: ; %Flow34
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}

define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p1_p1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v8, 15, v4
; CHECK-NEXT: v_mov_b32_e32 v9, 0
; CHECK-NEXT: v_and_b32_e32 v6, -16, v4
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB6_3
; CHECK-NEXT: ; %bb.1: ; %Flow41
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execnz .LBB6_10
; CHECK-NEXT: .LBB6_2: ; %Flow42
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB6_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB6_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v10, v0
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB6_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB6_5
; CHECK-NEXT: .LBB6_6: ; %Flow36
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB6_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB6_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_byte v[0:1], v4, off
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB6_8
; CHECK-NEXT: .LBB6_9: ; %Flow34
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execz .LBB6_2
; CHECK-NEXT: .LBB6_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB6_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB6_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v12, v[10:11], off
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_byte v[4:5], v12, off
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB6_12
; CHECK-NEXT: .LBB6_13: ; %Flow40
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB6_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB6_15
; CHECK-NEXT: .LBB6_16: ; %Flow38
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}

define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p1_p3:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
; CHECK-NEXT: s_cbranch_execz .LBB7_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v9, v2
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .LBB7_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_b128 v[10:13], v9
; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB7_2
; CHECK-NEXT: .LBB7_3: ; %Flow9
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB7_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
; CHECK-NEXT: .LBB7_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v7, v2
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_store_byte v[3:4], v7, off
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB7_5
; CHECK-NEXT: ; %bb.6: ; %Flow
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: .LBB7_7: ; %Flow7
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}

define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p1_p4:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v8, 15, v4
; CHECK-NEXT: v_mov_b32_e32 v9, 0
; CHECK-NEXT: v_and_b32_e32 v6, -16, v4
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB8_3
; CHECK-NEXT: ; %bb.1: ; %Flow38
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execnz .LBB8_10
; CHECK-NEXT: .LBB8_2: ; %Flow39
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB8_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB8_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: v_mov_b32_e32 v11, v1
; CHECK-NEXT: v_mov_b32_e32 v13, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v10, v0
; CHECK-NEXT: v_mov_b32_e32 v12, v6
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB8_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB8_5
; CHECK-NEXT: .LBB8_6: ; %Flow33
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB8_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB8_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_byte v[0:1], v4, off
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB8_8
; CHECK-NEXT: .LBB8_9: ; %Flow31
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execz .LBB8_2
; CHECK-NEXT: .LBB8_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB8_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB8_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v12, v[10:11], off
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_byte v[4:5], v12, off
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB8_12
; CHECK-NEXT: .LBB8_13: ; %Flow37
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB8_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB8_15
; CHECK-NEXT: .LBB8_16: ; %Flow35
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}

define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p1_p5:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
; CHECK-NEXT: s_cbranch_execz .LBB9_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v9, v2
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB9_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: buffer_load_dword v10, v9, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v11, v9, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v12, v9, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v13, v9, s[0:3], 0 offen offset:12
; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB9_2
; CHECK-NEXT: .LBB9_3: ; %Flow9
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB9_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
; CHECK-NEXT: .LBB9_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_byte v[3:4], v7, off
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB9_5
; CHECK-NEXT: ; %bb.6: ; %Flow
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: .LBB9_7: ; %Flow7
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
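; A flat (0) source may alias LDS, so the LDS destination is mapped into the
; flat address space via src_shared_base and compared against the source
; pointer to choose between the forward and backward copy loops.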
define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p3_p0:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base
; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6]
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo
; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc_lo
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[7:8]
; CHECK-NEXT: v_cmpx_ge_u64_e64 v[1:2], v[9:10]
; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB10_3
; CHECK-NEXT: ; %bb.1: ; %Flow39
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execnz .LBB10_10
; CHECK-NEXT: .LBB10_2: ; %Flow40
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB10_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB10_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v10, v2
; CHECK-NEXT: v_mov_b32_e32 v12, v8
; CHECK-NEXT: v_mov_b32_e32 v9, v1
; CHECK-NEXT: v_mov_b32_e32 v11, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v0
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10]
; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s6, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_write_b128 v4, v[13:16]
; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB10_5
; CHECK-NEXT: .LBB10_6: ; %Flow34
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB10_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v2, v8, s5
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB10_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v2, v[0:1]
; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, 0, v1, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6]
; CHECK-NEXT: s_or_b32 s9, s6, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_write_b8 v3, v2
; CHECK-NEXT: v_add_nc_u32_e32 v3, 1, v3
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB10_8
; CHECK-NEXT: .LBB10_9: ; %Flow32
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execz .LBB10_2
; CHECK-NEXT: .LBB10_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB10_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v2, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB10_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v11, v[9:10]
; CHECK-NEXT: v_add_co_u32 v5, s4, v5, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s4, -1, v6, s4
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6]
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_write_b8 v4, v11
; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB10_12
; CHECK-NEXT: .LBB10_13: ; %Flow38
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB10_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo
; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo
; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo
; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4]
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_write_b128 v0, v[3:6]
; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_cbranch_execnz .LBB10_15
; CHECK-NEXT: .LBB10_16: ; %Flow36
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
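; Global memory is treated as non-aliasing with LDS; forward-only memcpy-style
; expansion.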
define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p3_p1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
; CHECK-NEXT: s_cbranch_execz .LBB11_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v9, v0
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .LBB11_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b128 v9, v[10:13]
; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB11_2
; CHECK-NEXT: .LBB11_3: ; %Flow9
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB11_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
; CHECK-NEXT: .LBB11_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
; CHECK-NEXT: global_load_ubyte v3, v[3:4], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b8 v0, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB11_5
; CHECK-NEXT: ; %bb.6: ; %Flow
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: .LBB11_7: ; %Flow7
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
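; With the same address space on both sides an overlap is possible; a 32-bit
; pointer comparison selects between the forward and backward copy loops.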
define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p3_p3:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v5, 0
; CHECK-NEXT: v_and_b32_e32 v4, 15, v2
; CHECK-NEXT: v_and_b32_e32 v6, -16, v2
; CHECK-NEXT: v_mov_b32_e32 v7, v3
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5]
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT: v_cmpx_ge_u32_e64 v1, v0
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB12_3
; CHECK-NEXT: ; %bb.1: ; %Flow46
; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6
; CHECK-NEXT: s_cbranch_execnz .LBB12_10
; CHECK-NEXT: .LBB12_2: ; %Flow47
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB12_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB12_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v3, v1
; CHECK-NEXT: v_mov_b32_e32 v8, v0
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .LBB12_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_b128 v[9:12], v3
; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_write_b128 v8, v[9:12]
; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB12_5
; CHECK-NEXT: .LBB12_6: ; %Flow41
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB12_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v2, -16, v2
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
; CHECK-NEXT: .LBB12_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v2, v1
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[4:5]
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_write_b8 v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB12_8
; CHECK-NEXT: .LBB12_9: ; %Flow39
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr1
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6
; CHECK-NEXT: s_cbranch_execz .LBB12_2
; CHECK-NEXT: .LBB12_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s6, s4
; CHECK-NEXT: s_cbranch_execz .LBB12_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v2
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_nc_u32_e32 v6, v0, v7
; CHECK-NEXT: v_add_nc_u32_e32 v7, v1, v7
; CHECK-NEXT: .LBB12_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v8, v7
; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4
; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v7
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[4:5]
; CHECK-NEXT: s_or_b32 s7, s4, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_write_b8 v6, v8
; CHECK-NEXT: v_add_nc_u32_e32 v6, -1, v6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB12_12
; CHECK-NEXT: .LBB12_13: ; %Flow45
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB12_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v5, -16, v2
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4
; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5
; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4
; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_b128 v[5:8], v4
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_write_b128 v2, v[5:8]
; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB12_15
; CHECK-NEXT: .LBB12_16: ; %Flow43
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
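; A constant (4) source is treated as non-aliasing with LDS; forward-only
; expansion.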
define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p3_p4:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
; CHECK-NEXT: s_cbranch_execz .LBB13_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v9, v0
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .LBB13_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b128 v9, v[10:13]
; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB13_2
; CHECK-NEXT: .LBB13_3: ; %Flow9
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB13_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
; CHECK-NEXT: .LBB13_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
; CHECK-NEXT: global_load_ubyte v3, v[3:4], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b8 v0, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB13_5
; CHECK-NEXT: ; %bb.6: ; %Flow
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: .LBB13_7: ; %Flow7
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
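; Private memory is treated as non-aliasing with LDS; forward-only expansion
; using buffer loads and ds writes.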
define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p3_p5:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
; CHECK-NEXT: v_and_b32_e32 v5, 15, v4
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3]
; CHECK-NEXT: s_cbranch_execz .LBB14_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v7, v1
; CHECK-NEXT: v_mov_b32_e32 v8, v0
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: buffer_load_dword v9, v7, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v10, v7, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v11, v7, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v12, v7, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b128 v8, v[9:12]
; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB14_2
; CHECK-NEXT: .LBB14_3: ; %Flow14
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB14_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
; CHECK-NEXT: .LBB14_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b8 v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB14_5
; CHECK-NEXT: ; %bb.6: ; %Flow
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: .LBB14_7: ; %Flow12
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
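; A flat (0) source may alias private memory, so the private destination is
; mapped into flat via src_private_base and compared against the source to
; pick the copy direction.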
define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p5_p0:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6]
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo
; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc_lo
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[7:8]
; CHECK-NEXT: v_cmpx_ge_u64_e64 v[1:2], v[9:10]
; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB15_3
; CHECK-NEXT: ; %bb.1: ; %Flow39
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execnz .LBB15_10
; CHECK-NEXT: .LBB15_2: ; %Flow40
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB15_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s6, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB15_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v10, v2
; CHECK-NEXT: v_mov_b32_e32 v12, v8
; CHECK-NEXT: v_mov_b32_e32 v9, v1
; CHECK-NEXT: v_mov_b32_e32 v11, v7
; CHECK-NEXT: v_mov_b32_e32 v4, v0
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB15_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10]
; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v16, v4, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v15, v4, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v14, v4, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB15_5
; CHECK-NEXT: .LBB15_6: ; %Flow34
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB15_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v2, v8, s5
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB15_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v2, v[0:1]
; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, 0, v1, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6]
; CHECK-NEXT: s_or_b32 s9, s6, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v3, 1, v3
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB15_8
; CHECK-NEXT: .LBB15_9: ; %Flow32
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT: s_cbranch_execz .LBB15_2
; CHECK-NEXT: .LBB15_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB15_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v2, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB15_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v11, v[9:10]
; CHECK-NEXT: v_add_co_u32 v5, s4, v5, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s4, -1, v6, s4
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6]
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_byte v11, v4, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB15_12
; CHECK-NEXT: .LBB15_13: ; %Flow38
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB15_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo
; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB15_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo
; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo
; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8]
; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0
; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_cbranch_execnz .LBB15_15
; CHECK-NEXT: .LBB15_16: ; %Flow36
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
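; Global memory is treated as non-aliasing with private; forward-only
; expansion with global loads and buffer stores.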
define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p5_p1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
; CHECK-NEXT: s_cbranch_execz .LBB16_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v9, v0
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB16_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB16_2
; CHECK-NEXT: .LBB16_3: ; %Flow9
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB16_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
; CHECK-NEXT: .LBB16_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
; CHECK-NEXT: global_load_ubyte v3, v[3:4], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB16_5
; CHECK-NEXT: ; %bb.6: ; %Flow
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: .LBB16_7: ; %Flow7
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
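; LDS is treated as non-aliasing with private; forward-only expansion.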
define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p5_p3:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
; CHECK-NEXT: v_and_b32_e32 v5, 15, v4
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3]
; CHECK-NEXT: s_cbranch_execz .LBB17_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v7, v1
; CHECK-NEXT: v_mov_b32_e32 v8, v0
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_b128 v[9:12], v7
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB17_2
; CHECK-NEXT: .LBB17_3: ; %Flow14
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB17_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
; CHECK-NEXT: .LBB17_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v2, v1
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB17_5
; CHECK-NEXT: ; %bb.6: ; %Flow
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: .LBB17_7: ; %Flow12
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
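; A constant (4) source is treated as non-aliasing with private; forward-only
; expansion.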
define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p5_p4:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8]
; CHECK-NEXT: s_cbranch_execz .LBB18_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v9, v0
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB18_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB18_2
; CHECK-NEXT: .LBB18_3: ; %Flow9
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6]
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB18_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
; CHECK-NEXT: .LBB18_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
; CHECK-NEXT: global_load_ubyte v3, v[3:4], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB18_5
; CHECK-NEXT: ; %bb.6: ; %Flow
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: .LBB18_7: ; %Flow7
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
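; Same address space on both sides: a 32-bit pointer comparison selects the
; forward or backward copy loop, with both sides accessed through buffer
; instructions.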
define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p5_p5:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v5, 0
; CHECK-NEXT: v_and_b32_e32 v4, 15, v2
; CHECK-NEXT: v_and_b32_e32 v6, -16, v2
; CHECK-NEXT: v_mov_b32_e32 v7, v3
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5]
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT: v_cmpx_ge_u32_e64 v1, v0
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB19_3
; CHECK-NEXT: ; %bb.1: ; %Flow46
; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6
; CHECK-NEXT: s_cbranch_execnz .LBB19_10
; CHECK-NEXT: .LBB19_2: ; %Flow47
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB19_3: ; %memmove_copy_forward
; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB19_6
; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT: v_mov_b32_e32 v3, v1
; CHECK-NEXT: v_mov_b32_e32 v8, v0
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB19_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB19_5
; CHECK-NEXT: .LBB19_6: ; %Flow41
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB19_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v2, -16, v2
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
; CHECK-NEXT: .LBB19_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[4:5]
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB19_8
; CHECK-NEXT: .LBB19_9: ; %Flow39
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr1
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6
; CHECK-NEXT: s_cbranch_execz .LBB19_2
; CHECK-NEXT: .LBB19_10: ; %memmove_copy_backwards
; CHECK-NEXT: s_and_saveexec_b32 s6, s4
; CHECK-NEXT: s_cbranch_execz .LBB19_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v2
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_nc_u32_e32 v6, v0, v7
; CHECK-NEXT: v_add_nc_u32_e32 v7, v1, v7
; CHECK-NEXT: .LBB19_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v8, v7, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4
; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v7
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[4:5]
; CHECK-NEXT: s_or_b32 s7, s4, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_byte v8, v6, s[0:3], 0 offen
; CHECK-NEXT: v_add_nc_u32_e32 v6, -1, v6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB19_12
; CHECK-NEXT: .LBB19_13: ; %Flow45
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB19_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v5, -16, v2
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4
; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5
; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4
; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB19_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v4, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v7, v4, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB19_15
; CHECK-NEXT: .LBB19_16: ; %Flow43
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
ret void
}
declare void @llvm.memmove.p0.p0.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p0.p1.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p0.p3.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p0.p4.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p1.p0.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p1.p3.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p1.p4.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p3.p0.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p3.p1.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p3.p3.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p3.p4.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p3.p5.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p5.p0.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p5.p3.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p5.p4.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0
attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }