| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s |
| |
| ; Check code generation for memmoves whose size is not known at compile time,
| ; covering all combinations of the following address spaces:
| ; destination address space: 0 (flat), 1 (global), 3 (local), 5 (private)
| ; source address space: 0 (flat), 1 (global), 3 (local), 4 (constant), 5 (private)
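| ;
| ; As the CHECK lines below show, pairs that may alias (flat/flat, flat/global,
| ; global/constant, etc.) get a runtime copy-direction check: a forward 16-byte
| ; main loop plus a single-byte residual loop, and a mirrored backward pair for
| ; the overlapping case. For provably disjoint address-space pairs (e.g. global
| ; and local), no direction check is needed and the expansion reduces to a
| ; memcpy-style main loop and residual loop.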
| |
| define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p0_p0: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v9, 0 |
| ; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] |
| ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] |
| ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB0_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow35 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB0_10 |
| ; CHECK-NEXT: .LBB0_2: ; %Flow36 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB0_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v5, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v11, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v13, v7 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v10, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v12, v6 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB0_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[4:5] |
| ; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5 |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[14:17] |
| ; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB0_5 |
| ; CHECK-NEXT: .LBB0_6: ; %Flow30 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB0_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[2:3] |
| ; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 |
| ; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB0_8 |
| ; CHECK-NEXT: .LBB0_9: ; %Flow28 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 |
| ; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 |
| ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_2 |
| ; CHECK-NEXT: .LBB0_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB0_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_ubyte v12, v[10:11] |
| ; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s8, s4, s8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[4:5], v12 |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB0_12 |
| ; CHECK-NEXT: .LBB0_13: ; %Flow34 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB0_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4 |
| ; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5] |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo |
| ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, v4 |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB0_15 |
| ; CHECK-NEXT: .LBB0_16: ; %Flow32 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
| define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p0_p1: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v9, 0 |
| ; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] |
| ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] |
| ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB1_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow37 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB1_10 |
| ; CHECK-NEXT: .LBB1_2: ; %Flow38 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB1_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB1_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v5, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v11, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v13, v7 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v10, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v12, v6 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB1_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off |
| ; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5 |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[14:17] |
| ; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB1_5 |
| ; CHECK-NEXT: .LBB1_6: ; %Flow32 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB1_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off |
| ; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 |
| ; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB1_8 |
| ; CHECK-NEXT: .LBB1_9: ; %Flow30 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 |
| ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 |
| ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execz .LBB1_2 |
| ; CHECK-NEXT: .LBB1_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB1_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: v_add_co_u32 v4, s4, v2, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v3, v11, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v1, v11, s4 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_ubyte v12, v[4:5], off |
| ; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4 |
| ; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s8, s4, s8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[10:11], v12 |
| ; CHECK-NEXT: v_add_co_u32 v10, s5, v10, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, -1, v11, s5 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB1_12 |
| ; CHECK-NEXT: .LBB1_13: ; %Flow36 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB1_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4 |
| ; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo |
| ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, v4 |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB1_15 |
| ; CHECK-NEXT: .LBB1_16: ; %Flow34 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
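| ; A flat destination may itself address LDS, so the copy-direction check is
| ; still required below; since local pointers are 32 bits wide, the comparison
| ; is a 32-bit v_cmpx_ge_u32, with a v_cndmask select that appears to handle a
| ; null flat pointer (which cannot alias LDS).
| ;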
| define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p0_p3: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v7, 15, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, 0 |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] |
| ; CHECK-NEXT: v_and_b32_e32 v5, -16, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, v4 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8] |
| ; CHECK-NEXT: v_cndmask_b32_e32 v9, -1, v0, vcc_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6] |
| ; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v9 |
| ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB2_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow39 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB2_10 |
| ; CHECK-NEXT: .LBB2_2: ; %Flow40 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB2_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB2_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v10, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v12, v6 |
| ; CHECK-NEXT: v_mov_b32_e32 v9, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v11, v5 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v2 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_b128 v[13:16], v4 |
| ; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16] |
| ; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB2_5 |
| ; CHECK-NEXT: .LBB2_6: ; %Flow34 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB2_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB2_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_u8 v3, v2 |
| ; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 |
| ; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB2_8 |
| ; CHECK-NEXT: .LBB2_9: ; %Flow32 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 |
| ; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4 |
| ; CHECK-NEXT: ; implicit-def: $vgpr2 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execz .LBB2_2 |
| ; CHECK-NEXT: .LBB2_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB2_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v1, s4 |
| ; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1 |
| ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB2_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_u8 v11, v4 |
| ; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8] |
| ; CHECK-NEXT: s_or_b32 s8, s4, s8 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[9:10], v11 |
| ; CHECK-NEXT: v_add_co_u32 v9, s5, v9, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, -1, v10, s5 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB2_12 |
| ; CHECK-NEXT: .LBB2_13: ; %Flow38 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB2_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo |
| ; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_b128 v[7:10], v2 |
| ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4] |
| ; CHECK-NEXT: v_mov_b32_e32 v6, v4 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v5, v3 |
| ; CHECK-NEXT: s_or_b32 s7, s4, s7 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10] |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB2_15 |
| ; CHECK-NEXT: .LBB2_16: ; %Flow36 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
| define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p0_p4: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v9, 0 |
| ; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] |
| ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] |
| ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB3_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow34 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB3_10 |
| ; CHECK-NEXT: .LBB3_2: ; %Flow35 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB3_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB3_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v5, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v11, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v13, v7 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v10, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v12, v6 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB3_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off |
| ; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5 |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[14:17] |
| ; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB3_5 |
| ; CHECK-NEXT: .LBB3_6: ; %Flow29 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB3_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB3_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off |
| ; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v4 |
| ; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB3_8 |
| ; CHECK-NEXT: .LBB3_9: ; %Flow27 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 |
| ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 |
| ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execz .LBB3_2 |
| ; CHECK-NEXT: .LBB3_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB3_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB3_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off |
| ; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s8, s4, s8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[4:5], v12 |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB3_12 |
| ; CHECK-NEXT: .LBB3_13: ; %Flow33 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB3_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB3_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4 |
| ; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo |
| ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, v4 |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB3_15 |
| ; CHECK-NEXT: .LBB3_16: ; %Flow31 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
| define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p0_p5: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v7, 15, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, 0 |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] |
| ; CHECK-NEXT: v_and_b32_e32 v5, -16, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, v4 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8] |
| ; CHECK-NEXT: v_cndmask_b32_e32 v9, -1, v0, vcc_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6] |
| ; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v9 |
| ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB4_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow39 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB4_10 |
| ; CHECK-NEXT: .LBB4_2: ; %Flow40 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB4_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB4_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v10, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v12, v6 |
| ; CHECK-NEXT: v_mov_b32_e32 v9, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v11, v5 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v2 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB4_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: s_clause 0x3 |
| ; CHECK-NEXT: buffer_load_dword v13, v4, s[0:3], 0 offen |
| ; CHECK-NEXT: buffer_load_dword v14, v4, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_load_dword v15, v4, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_load_dword v16, v4, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16] |
| ; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB4_5 |
| ; CHECK-NEXT: .LBB4_6: ; %Flow34 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB4_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB4_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[0:1], v3 |
| ; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB4_8 |
| ; CHECK-NEXT: .LBB4_9: ; %Flow32 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 |
| ; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4 |
| ; CHECK-NEXT: ; implicit-def: $vgpr2 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execz .LBB4_2 |
| ; CHECK-NEXT: .LBB4_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB4_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v1, s4 |
| ; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1 |
| ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB4_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8] |
| ; CHECK-NEXT: s_or_b32 s8, s4, s8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_byte v[9:10], v11 |
| ; CHECK-NEXT: v_add_co_u32 v9, s5, v9, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, -1, v10, s5 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB4_12 |
| ; CHECK-NEXT: .LBB4_13: ; %Flow38 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB4_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo |
| ; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB4_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: s_clause 0x3 |
| ; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen |
| ; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4] |
| ; CHECK-NEXT: v_mov_b32_e32 v6, v4 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v5, v3 |
| ; CHECK-NEXT: s_or_b32 s7, s4, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10] |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB4_15 |
| ; CHECK-NEXT: .LBB4_16: ; %Flow36 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| 
| define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p1_p0: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v9, 0 |
| ; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] |
| ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] |
| ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB5_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow37 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB5_10 |
| ; CHECK-NEXT: .LBB5_2: ; %Flow38 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB5_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB5_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v5, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v11, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v13, v7 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v10, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v12, v6 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB5_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[4:5] |
| ; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5 |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off |
| ; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB5_5 |
| ; CHECK-NEXT: .LBB5_6: ; %Flow32 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB5_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB5_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_ubyte v4, v[2:3] |
| ; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: global_store_byte v[0:1], v4, off |
| ; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB5_8 |
| ; CHECK-NEXT: .LBB5_9: ; %Flow30 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 |
| ; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 |
| ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execz .LBB5_2 |
| ; CHECK-NEXT: .LBB5_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB5_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB5_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_ubyte v12, v[10:11] |
| ; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s8, s4, s8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: global_store_byte v[4:5], v12, off |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB5_12 |
| ; CHECK-NEXT: .LBB5_13: ; %Flow36 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB5_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB5_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4 |
| ; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5] |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo |
| ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, v4 |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB5_15 |
| ; CHECK-NEXT: .LBB5_16: ; %Flow34 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
| define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p1_p1: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v9, 0 |
| ; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] |
| ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] |
| ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB6_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow41 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB6_10 |
| ; CHECK-NEXT: .LBB6_2: ; %Flow42 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB6_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB6_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v5, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v11, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v13, v7 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v10, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v12, v6 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB6_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off |
| ; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5 |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off |
| ; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB6_5 |
| ; CHECK-NEXT: .LBB6_6: ; %Flow36 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB6_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB6_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off |
| ; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_byte v[0:1], v4, off |
| ; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB6_8 |
| ; CHECK-NEXT: .LBB6_9: ; %Flow34 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 |
| ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 |
| ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execz .LBB6_2 |
| ; CHECK-NEXT: .LBB6_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB6_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB6_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off |
| ; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s8, s4, s8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_byte v[4:5], v12, off |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB6_12 |
| ; CHECK-NEXT: .LBB6_13: ; %Flow40 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB6_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4 |
| ; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo |
| ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, v4 |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB6_15 |
| ; CHECK-NEXT: .LBB6_16: ; %Flow38 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
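| ; Global and local memory cannot alias, so no copy-direction check is emitted
| ; below: the memmove expands to a straight memcpy-style main loop and residual
| ; loop (see the loop-memcpy-expansion blocks).
| ;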
| define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p1_p3: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v4 |
| ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] |
| ; CHECK-NEXT: s_cbranch_execz .LBB7_3 |
| ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v9, v2 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .LBB7_2: ; %loop-memcpy-expansion |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_b128 v[10:13], v9 |
| ; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4 |
| ; CHECK-NEXT: s_add_u32 s4, s4, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB7_2 |
| ; CHECK-NEXT: .LBB7_3: ; %Flow9 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] |
| ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execz .LBB7_7 |
| ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo |
| ; CHECK-NEXT: .LBB7_5: ; %loop-memcpy-residual |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_u8 v7, v2 |
| ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4 |
| ; CHECK-NEXT: s_add_u32 s4, s4, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v1, vcc_lo |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: global_store_byte v[3:4], v7, off |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB7_5 |
| ; CHECK-NEXT: ; %bb.6: ; %Flow |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: .LBB7_7: ; %Flow7 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
| define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p1_p4: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v9, 0 |
| ; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] |
| ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] |
| ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB8_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow38 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB8_10 |
| ; CHECK-NEXT: .LBB8_2: ; %Flow39 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB8_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB8_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v5, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v11, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v13, v7 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v10, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v12, v6 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB8_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off |
| ; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5 |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off |
| ; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB8_5 |
| ; CHECK-NEXT: .LBB8_6: ; %Flow33 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB8_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB8_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off |
| ; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5 |
| ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s9, s5, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_byte v[0:1], v4, off |
| ; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB8_8 |
| ; CHECK-NEXT: .LBB8_9: ; %Flow31 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 |
| ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 |
| ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execz .LBB8_2 |
| ; CHECK-NEXT: .LBB8_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB8_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB8_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off |
| ; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4 |
| ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] |
| ; CHECK-NEXT: s_or_b32 s8, s4, s8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_byte v[4:5], v12, off |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB8_12 |
| ; CHECK-NEXT: .LBB8_13: ; %Flow37 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB8_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4 |
| ; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off |
| ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo |
| ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v5 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, v4 |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB8_15 |
| ; CHECK-NEXT: .LBB8_16: ; %Flow35 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
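| ; Global <- private: these address spaces cannot alias, so the memmove is lowered |
| ; like a memcpy: a forward-only 16-byte main loop plus a byte-wise residual loop. |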
| define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p1_p5: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v4 |
| ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] |
| ; CHECK-NEXT: s_cbranch_execz .LBB9_3 |
| ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v9, v2 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB9_2: ; %loop-memcpy-expansion |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: s_clause 0x3 |
| ; CHECK-NEXT: buffer_load_dword v10, v9, s[0:3], 0 offen |
| ; CHECK-NEXT: buffer_load_dword v11, v9, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_load_dword v12, v9, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_load_dword v13, v9, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4 |
| ; CHECK-NEXT: s_add_u32 s4, s4, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB9_2 |
| ; CHECK-NEXT: .LBB9_3: ; %Flow9 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] |
| ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execz .LBB9_7 |
| ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo |
| ; CHECK-NEXT: .LBB9_5: ; %loop-memcpy-residual |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4 |
| ; CHECK-NEXT: s_add_u32 s4, s4, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v1, vcc_lo |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: global_store_byte v[3:4], v7, off |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB9_5 |
| ; CHECK-NEXT: ; %bb.6: ; %Flow |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: .LBB9_7: ; %Flow7 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
| |
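| ; LDS <- flat: aliasing is possible, so the LDS destination is cast to a flat |
| ; address via the src_shared_base aperture and compared with the source pointer |
| ; to choose between the forward and backward copy loops. |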
| define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p3_p0: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 0 |
| ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base |
| ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v4 |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6] |
| ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo |
| ; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc_lo |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[7:8] |
| ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[1:2], v[9:10] |
| ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB10_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow39 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB10_10 |
| ; CHECK-NEXT: .LBB10_2: ; %Flow40 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB10_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB10_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v10, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v12, v8 |
| ; CHECK-NEXT: v_mov_b32_e32 v9, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v11, v7 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v0 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10] |
| ; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5 |
| ; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[11:12] |
| ; CHECK-NEXT: s_or_b32 s9, s6, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: ds_write_b128 v4, v[13:16] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB10_5 |
| ; CHECK-NEXT: .LBB10_6: ; %Flow34 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB10_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3 |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v2, v8, s5 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB10_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_ubyte v2, v[0:1] |
| ; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5 |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, 0, v1, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6] |
| ; CHECK-NEXT: s_or_b32 s9, s6, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: ds_write_b8 v3, v2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v3, 1, v3 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB10_8 |
| ; CHECK-NEXT: .LBB10_9: ; %Flow32 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0 |
| ; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2 |
| ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execz .LBB10_2 |
| ; CHECK-NEXT: .LBB10_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB10_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v2, s4 |
| ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1 |
| ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB10_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_ubyte v11, v[9:10] |
| ; CHECK-NEXT: v_add_co_u32 v5, s4, v5, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s4, -1, v6, s4 |
| ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6] |
| ; CHECK-NEXT: s_or_b32 s8, s5, s8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: ds_write_b8 v4, v11 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB10_12 |
| ; CHECK-NEXT: .LBB10_13: ; %Flow38 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB10_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo |
| ; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16 |
| ; CHECK-NEXT: s_mov_b32 s5, 0 |
| ; CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo |
| ; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4] |
| ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8] |
| ; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: ds_write_b128 v0, v[3:6] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB10_15 |
| ; CHECK-NEXT: .LBB10_16: ; %Flow36 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
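| ; LDS <- global: no aliasing is possible, so a forward-only expansion suffices |
| ; (global loads feeding ds_write). |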
| define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p3_p1: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v4 |
| ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] |
| ; CHECK-NEXT: s_cbranch_execz .LBB11_3 |
| ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v9, v0 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .LBB11_2: ; %loop-memcpy-expansion |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo |
| ; CHECK-NEXT: s_add_u32 s4, s4, 16 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] |
| ; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: ds_write_b128 v9, v[10:13] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB11_2 |
| ; CHECK-NEXT: .LBB11_3: ; %Flow9 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] |
| ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execz .LBB11_7 |
| ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo |
| ; CHECK-NEXT: .LBB11_5: ; %loop-memcpy-residual |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo |
| ; CHECK-NEXT: s_add_u32 s4, s4, 1 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] |
| ; CHECK-NEXT: global_load_ubyte v3, v[3:4], off |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: ds_write_b8 v0, v3 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB11_5 |
| ; CHECK-NEXT: ; %bb.6: ; %Flow |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: .LBB11_7: ; %Flow7 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
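| ; LDS <- LDS: same address space, so the forward/backward decision is a plain |
| ; 32-bit pointer comparison (v_cmpx_ge_u32). |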
| define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p3_p3: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v5, 0 |
| ; CHECK-NEXT: v_and_b32_e32 v4, 15, v2 |
| ; CHECK-NEXT: v_and_b32_e32 v6, -16, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v3 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5] |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] |
| ; CHECK-NEXT: v_cmpx_ge_u32_e64 v1, v0 |
| ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB12_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow46 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB12_10 |
| ; CHECK-NEXT: .LBB12_2: ; %Flow47 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB12_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB12_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v3, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v0 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: .LBB12_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_b128 v[9:12], v3 |
| ; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] |
| ; CHECK-NEXT: s_or_b32 s8, s5, s8 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: ds_write_b128 v8, v[9:12] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB12_5 |
| ; CHECK-NEXT: .LBB12_6: ; %Flow41 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB12_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v2, -16, v2 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 |
| ; CHECK-NEXT: .LBB12_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_u8 v2, v1 |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[4:5] |
| ; CHECK-NEXT: s_or_b32 s8, s5, s8 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: ds_write_b8 v0, v2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB12_8 |
| ; CHECK-NEXT: .LBB12_9: ; %Flow39 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0 |
| ; CHECK-NEXT: ; implicit-def: $vgpr1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6 |
| ; CHECK-NEXT: s_cbranch_execz .LBB12_2 |
| ; CHECK-NEXT: .LBB12_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s6, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB12_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v2 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v6, v0, v7 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v7, v1, v7 |
| ; CHECK-NEXT: .LBB12_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_u8 v8, v7 |
| ; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v7 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[4:5] |
| ; CHECK-NEXT: s_or_b32 s7, s4, s7 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: ds_write_b8 v6, v8 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v6, -1, v6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB12_12 |
| ; CHECK-NEXT: .LBB12_13: ; %Flow45 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB12_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v5, -16, v2 |
| ; CHECK-NEXT: s_mov_b32 s6, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4 |
| ; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4 |
| ; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo |
| ; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_b128 v[5:8], v4 |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] |
| ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: ds_write_b128 v2, v[5:8] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB12_15 |
| ; CHECK-NEXT: .LBB12_16: ; %Flow43 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
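| ; LDS <- constant: the spaces cannot alias; the constant source is read with |
| ; global loads in a forward-only expansion. |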
| define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p3_p4: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v4 |
| ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] |
| ; CHECK-NEXT: s_cbranch_execz .LBB13_3 |
| ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v9, v0 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .LBB13_2: ; %loop-memcpy-expansion |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo |
| ; CHECK-NEXT: s_add_u32 s4, s4, 16 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] |
| ; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: ds_write_b128 v9, v[10:13] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB13_2 |
| ; CHECK-NEXT: .LBB13_3: ; %Flow9 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] |
| ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execz .LBB13_7 |
| ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo |
| ; CHECK-NEXT: .LBB13_5: ; %loop-memcpy-residual |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo |
| ; CHECK-NEXT: s_add_u32 s4, s4, 1 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] |
| ; CHECK-NEXT: global_load_ubyte v3, v[3:4], off |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: ds_write_b8 v0, v3 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB13_5 |
| ; CHECK-NEXT: ; %bb.6: ; %Flow |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: .LBB13_7: ; %Flow7 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
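| ; LDS <- private: no aliasing is possible; forward-only expansion with scratch |
| ; buffer loads feeding ds_write. |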
| define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p3_p5: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 |
| ; CHECK-NEXT: v_and_b32_e32 v5, 15, v4 |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3] |
| ; CHECK-NEXT: s_cbranch_execz .LBB14_3 |
| ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v0 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: s_clause 0x3 |
| ; CHECK-NEXT: buffer_load_dword v9, v7, s[0:3], 0 offen |
| ; CHECK-NEXT: buffer_load_dword v10, v7, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_load_dword v11, v7, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_load_dword v12, v7, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: s_add_u32 s4, s4, 16 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3] |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: ds_write_b128 v8, v[9:12] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB14_2 |
| ; CHECK-NEXT: .LBB14_3: ; %Flow14 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] |
| ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execz .LBB14_7 |
| ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 |
| ; CHECK-NEXT: .LBB14_5: ; %loop-memcpy-residual |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen |
| ; CHECK-NEXT: s_add_u32 s4, s4, 1 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: ds_write_b8 v0, v2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB14_5 |
| ; CHECK-NEXT: ; %bb.6: ; %Flow |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: .LBB14_7: ; %Flow12 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
| |
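| ; Private <- flat: aliasing is possible, so the private destination is cast to a |
| ; flat address via the src_private_base aperture and compared with the source |
| ; pointer to choose the copy direction. |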
| define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p5_p0: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 0 |
| ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base |
| ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v4 |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6] |
| ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo |
| ; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc_lo |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[7:8] |
| ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[1:2], v[9:10] |
| ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB15_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow39 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB15_10 |
| ; CHECK-NEXT: .LBB15_2: ; %Flow40 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB15_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s6, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB15_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v10, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v12, v8 |
| ; CHECK-NEXT: v_mov_b32_e32 v9, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v11, v7 |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v0 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB15_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10] |
| ; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5 |
| ; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12] |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: buffer_store_dword v16, v4, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: buffer_store_dword v15, v4, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_store_dword v14, v4, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4 |
| ; CHECK-NEXT: s_or_b32 s8, s5, s8 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB15_5 |
| ; CHECK-NEXT: .LBB15_6: ; %Flow34 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB15_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: s_mov_b32 s9, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3 |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v2, v8, s5 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB15_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_ubyte v2, v[0:1] |
| ; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5 |
| ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, 0, v1, s5 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6] |
| ; CHECK-NEXT: s_or_b32 s9, s6, s9 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v3, 1, v3 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB15_8 |
| ; CHECK-NEXT: .LBB15_9: ; %Flow32 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 |
| ; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0 |
| ; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2 |
| ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 |
| ; CHECK-NEXT: s_cbranch_execz .LBB15_2 |
| ; CHECK-NEXT: .LBB15_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB15_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v2, s4 |
| ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1 |
| ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB15_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flat_load_ubyte v11, v[9:10] |
| ; CHECK-NEXT: v_add_co_u32 v5, s4, v5, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s4, -1, v6, s4 |
| ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6] |
| ; CHECK-NEXT: s_or_b32 s8, s5, s8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: buffer_store_byte v11, v4, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB15_12 |
| ; CHECK-NEXT: .LBB15_13: ; %Flow38 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB15_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo |
| ; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16 |
| ; CHECK-NEXT: s_mov_b32 s5, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB15_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo |
| ; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4] |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen |
| ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0 |
| ; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB15_15 |
| ; CHECK-NEXT: .LBB15_16: ; %Flow36 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
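| ; Private <- global: no aliasing is possible; forward-only expansion with global |
| ; loads feeding scratch buffer stores. |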
| define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p5_p1: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v4 |
| ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] |
| ; CHECK-NEXT: s_cbranch_execz .LBB16_3 |
| ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v9, v0 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB16_2: ; %loop-memcpy-expansion |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo |
| ; CHECK-NEXT: s_add_u32 s4, s4, 16 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] |
| ; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB16_2 |
| ; CHECK-NEXT: .LBB16_3: ; %Flow9 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] |
| ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execz .LBB16_7 |
| ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo |
| ; CHECK-NEXT: .LBB16_5: ; %loop-memcpy-residual |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo |
| ; CHECK-NEXT: s_add_u32 s4, s4, 1 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] |
| ; CHECK-NEXT: global_load_ubyte v3, v[3:4], off |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB16_5 |
| ; CHECK-NEXT: ; %bb.6: ; %Flow |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: .LBB16_7: ; %Flow7 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
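| ; Private <- LDS: no aliasing is possible; forward-only expansion with ds_read |
| ; feeding scratch buffer stores. |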
| define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p5_p3: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 |
| ; CHECK-NEXT: v_and_b32_e32 v5, 15, v4 |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3] |
| ; CHECK-NEXT: s_cbranch_execz .LBB17_3 |
| ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v0 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_b128 v[9:12], v7 |
| ; CHECK-NEXT: s_add_u32 s4, s4, 16 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3] |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB17_2 |
| ; CHECK-NEXT: .LBB17_3: ; %Flow14 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] |
| ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execz .LBB17_7 |
| ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 |
| ; CHECK-NEXT: .LBB17_5: ; %loop-memcpy-residual |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: ds_read_u8 v2, v1 |
| ; CHECK-NEXT: s_add_u32 s4, s4, 1 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB17_5 |
| ; CHECK-NEXT: ; %bb.6: ; %Flow |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: .LBB17_7: ; %Flow12 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
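| ; Private <- constant: lowered like the global-source case; the constant source |
| ; is read with global loads. |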
| define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p5_p4: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v4 |
| ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 |
| ; CHECK-NEXT: v_mov_b32_e32 v6, 0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] |
| ; CHECK-NEXT: s_cbranch_execz .LBB18_3 |
| ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v9, v0 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB18_2: ; %loop-memcpy-expansion |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo |
| ; CHECK-NEXT: s_add_u32 s4, s4, 16 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] |
| ; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB18_2 |
| ; CHECK-NEXT: .LBB18_3: ; %Flow9 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] |
| ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execz .LBB18_7 |
| ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo |
| ; CHECK-NEXT: .LBB18_5: ; %loop-memcpy-residual |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo |
| ; CHECK-NEXT: s_add_u32 s4, s4, 1 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, 0 |
| ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] |
| ; CHECK-NEXT: global_load_ubyte v3, v[3:4], off |
| ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB18_5 |
| ; CHECK-NEXT: ; %bb.6: ; %Flow |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: .LBB18_7: ; %Flow7 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
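| ; Private <- private: same address space, so the copy direction is chosen by a |
| ; 32-bit comparison of the scratch offsets (v_cmpx_ge_u32). |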
| define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { |
| ; CHECK-LABEL: memmove_p5_p5: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v5, 0 |
| ; CHECK-NEXT: v_and_b32_e32 v4, 15, v2 |
| ; CHECK-NEXT: v_and_b32_e32 v6, -16, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v7, v3 |
| ; CHECK-NEXT: s_mov_b32 s6, exec_lo |
| ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5] |
| ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] |
| ; CHECK-NEXT: v_cmpx_ge_u32_e64 v1, v0 |
| ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB19_3 |
| ; CHECK-NEXT: ; %bb.1: ; %Flow46 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB19_10 |
| ; CHECK-NEXT: .LBB19_2: ; %Flow47 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; CHECK-NEXT: .LBB19_3: ; %memmove_copy_forward |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB19_6 |
| ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader |
| ; CHECK-NEXT: v_mov_b32_e32 v3, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v8, v0 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB19_5: ; %memmove_fwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: s_clause 0x3 |
| ; CHECK-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3 |
| ; CHECK-NEXT: s_waitcnt vmcnt(3) |
| ; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: s_waitcnt vmcnt(2) |
| ; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(1) |
| ; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 |
| ; CHECK-NEXT: s_or_b32 s8, s5, s8 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB19_5 |
| ; CHECK-NEXT: .LBB19_6: ; %Flow41 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB19_9 |
| ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v2, -16, v2 |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 |
| ; CHECK-NEXT: .LBB19_8: ; %memmove_fwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[4:5] |
| ; CHECK-NEXT: s_or_b32 s8, s5, s8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB19_8 |
| ; CHECK-NEXT: .LBB19_9: ; %Flow39 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 |
| ; CHECK-NEXT: ; implicit-def: $vgpr0 |
| ; CHECK-NEXT: ; implicit-def: $vgpr1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6 |
| ; CHECK-NEXT: s_cbranch_execz .LBB19_2 |
| ; CHECK-NEXT: .LBB19_10: ; %memmove_copy_backwards |
| ; CHECK-NEXT: s_and_saveexec_b32 s6, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB19_13 |
| ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader |
| ; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v2 |
| ; CHECK-NEXT: s_mov_b32 s7, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v6, v0, v7 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v7, v1, v7 |
| ; CHECK-NEXT: .LBB19_12: ; %memmove_bwd_residual_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: buffer_load_ubyte v8, v7, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v7 |
| ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[4:5] |
| ; CHECK-NEXT: s_or_b32 s7, s4, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: buffer_store_byte v8, v6, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_nc_u32_e32 v6, -1, v6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB19_12 |
| ; CHECK-NEXT: .LBB19_13: ; %Flow45 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB19_16 |
| ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader |
| ; CHECK-NEXT: v_and_b32_e32 v5, -16, v2 |
| ; CHECK-NEXT: s_mov_b32 s6, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4 |
| ; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4 |
| ; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB19_15: ; %memmove_bwd_main_loop |
| ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: s_clause 0x3 |
| ; CHECK-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: buffer_load_dword v6, v4, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: buffer_load_dword v7, v4, s[0:3], 0 offen |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16 |
| ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo |
| ; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4 |
| ; CHECK-NEXT: s_waitcnt vmcnt(3) |
| ; CHECK-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:12 |
| ; CHECK-NEXT: s_waitcnt vmcnt(2) |
| ; CHECK-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(1) |
| ; CHECK-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen offset:4 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen |
| ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] |
| ; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 |
| ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB19_15 |
| ; CHECK-NEXT: .LBB19_16: ; %Flow43 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) |
| ret void |
| } |
| |
| |
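| ; memmove intrinsic declarations for every tested (dst, src) address-space pair. |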
| declare void @llvm.memmove.p0.p0.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p0.p1.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p0.p3.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p0.p4.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p1.p0.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p1.p3.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p1.p4.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p3.p0.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p3.p1.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p3.p3.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p3.p4.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p3.p5.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p5.p0.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p5.p3.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p5.p4.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0 |
| declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0 |
| |
| attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } |