| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s |
| |
| ; copy_flat: kernel that copies %n <4 x i32> elements from %s to %d, one element |
| ; per loop iteration, through plain `ptr` arguments (no addrspace qualifier). |
| ; |
| ; Expected code shared by both llc invocations at the top of the file -- |
| ; - the copy is done with flat_load_b128 / flat_store_b128; |
| ; - the source pointer in s[2:3] is advanced by 0xb0 in the preheader and the |
| ; load uses offset:-176 (0xb0 == 176), so the load still reads element i; |
| ; - s[2:3] and s[0:1] advance by 16 bytes per iteration and s6 is decremented |
| ; until it reaches 0. |
| ; Only the +safe-smem-prefetch invocation (prefix GFX12-SPREFETCH) additionally |
| ; expects `s_prefetch_data s[2:3], 0x0, null, 0` inside the loop. Assuming the |
| ; second operand is a byte offset, that addresses 176 bytes past the current |
| ; load -- TODO confirm against the ISA documentation. |
| ; The two expectations also differ in the relative order of flat_load_b128 and |
| ; the s[2:3] increment. |
| ; The check lines below are autogenerated; regenerate them with |
| ; utils/update_llc_test_checks.py instead of editing them by hand. |
| define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) { |
| ; GFX12-LABEL: copy_flat: |
| ; GFX12: ; %bb.0: ; %entry |
| ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_eq_u32 s6, 0 |
| ; GFX12-NEXT: s_cbranch_scc1 .LBB0_3 |
| ; GFX12-NEXT: ; %bb.1: ; %for.body.preheader |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 |
| ; GFX12-NEXT: .LBB0_2: ; %for.body |
| ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-NEXT: s_wait_alu 0xfffe |
| ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 |
| ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 |
| ; GFX12-NEXT: s_add_co_i32 s6, s6, -1 |
| ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 |
| ; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 |
| ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 |
| ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 |
| ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3] |
| ; GFX12-NEXT: s_cbranch_scc1 .LBB0_2 |
| ; GFX12-NEXT: .LBB0_3: ; %for.end |
| ; GFX12-NEXT: s_endpgm |
| ; |
| ; GFX12-SPREFETCH-LABEL: copy_flat: |
| ; GFX12-SPREFETCH: ; %bb.0: ; %entry |
| ; GFX12-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34 |
| ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0 |
| ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3 |
| ; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader |
| ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 |
| ; GFX12-SPREFETCH-NEXT: .LBB0_2: ; %for.body |
| ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe |
| ; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 |
| ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 |
| ; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 |
| ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 |
| ; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 |
| ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 |
| ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 |
| ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 |
| ; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0 |
| ; GFX12-SPREFETCH-NEXT: flat_store_b128 v[4:5], v[0:3] |
| ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_2 |
| ; GFX12-SPREFETCH-NEXT: .LBB0_3: ; %for.end |
| ; GFX12-SPREFETCH-NEXT: s_endpgm |
| entry: |
| ; Skip the loop entirely when %n is zero. |
| %cmp6.not = icmp eq i32 %n, 0 |
| br i1 %cmp6.not, label %for.end, label %for.body |
| |
| for.body: ; preds = %entry, %for.body |
| ; %i.07 is the element index -- 0 when coming from %entry, %inc on the back edge. |
| %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] |
| %idxprom = zext i32 %i.07 to i64 |
| ; Element-wise copy of s[i] to d[i]; both accesses are only 4-byte aligned. |
| %arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom |
| %ld = load <4 x i32>, ptr %arrayidx, align 4 |
| %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom |
| store <4 x i32> %ld, ptr %arrayidx2, align 4 |
| ; Leave the loop once the incremented index equals %n. |
| %inc = add nuw i32 %i.07, 1 |
| %exitcond.not = icmp eq i32 %inc, %n |
| br i1 %exitcond.not, label %for.end, label %for.body |
| |
| for.end: ; preds = %for.body, %entry |
| ret void |
| } |
| |
| ; copy_global: same element-wise <4 x i32> copy loop as a kernel whose source |
| ; and destination are both addrspace(1) pointers. |
| ; |
| ; Expected code shared by both llc invocations at the top of the file -- |
| ; - the copy is done with global_load_b128 / global_store_b128, using the SGPR |
| ; pairs s[2:3] (source) and s[0:1] (destination) as base and v0, set to 0 in |
| ; the preheader, as the VGPR operand; |
| ; - s[2:3] is advanced by 0xb0 in the preheader and the load uses offset:-176 |
| ; (0xb0 == 176), so the load still reads element i; |
| ; - both bases advance by 16 bytes per iteration. |
| ; The only difference for the +safe-smem-prefetch invocation (prefix |
| ; GFX12-SPREFETCH) is the extra `s_prefetch_data s[2:3], 0x0, null, 0` directly |
| ; after the load. |
| ; The check lines below are autogenerated; regenerate them with |
| ; utils/update_llc_test_checks.py instead of editing them by hand. |
| define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) { |
| ; GFX12-LABEL: copy_global: |
| ; GFX12: ; %bb.0: ; %entry |
| ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_eq_u32 s6, 0 |
| ; GFX12-NEXT: s_cbranch_scc1 .LBB1_3 |
| ; GFX12-NEXT: ; %bb.1: ; %for.body.preheader |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 |
| ; GFX12-NEXT: .LBB1_2: ; %for.body |
| ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 |
| ; GFX12-NEXT: s_add_co_i32 s6, s6, -1 |
| ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 |
| ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 |
| ; GFX12-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] |
| ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 |
| ; GFX12-NEXT: s_cbranch_scc1 .LBB1_2 |
| ; GFX12-NEXT: .LBB1_3: ; %for.end |
| ; GFX12-NEXT: s_endpgm |
| ; |
| ; GFX12-SPREFETCH-LABEL: copy_global: |
| ; GFX12-SPREFETCH: ; %bb.0: ; %entry |
| ; GFX12-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34 |
| ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0 |
| ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_3 |
| ; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader |
| ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 |
| ; GFX12-SPREFETCH-NEXT: .LBB1_2: ; %for.body |
| ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-SPREFETCH-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 |
| ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 |
| ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 |
| ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 |
| ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 |
| ; GFX12-SPREFETCH-NEXT: s_wait_loadcnt 0x0 |
| ; GFX12-SPREFETCH-NEXT: global_store_b128 v0, v[1:4], s[0:1] |
| ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 |
| ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_2 |
| ; GFX12-SPREFETCH-NEXT: .LBB1_3: ; %for.end |
| ; GFX12-SPREFETCH-NEXT: s_endpgm |
| entry: |
| ; Skip the loop entirely when %n is zero. |
| %cmp6.not = icmp eq i32 %n, 0 |
| br i1 %cmp6.not, label %for.end, label %for.body |
| |
| for.body: ; preds = %entry, %for.body |
| ; %i.07 is the element index -- 0 when coming from %entry, %inc on the back edge. |
| %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] |
| %idxprom = zext i32 %i.07 to i64 |
| ; Element-wise copy of s[i] to d[i]; both accesses are only 4-byte aligned. |
| %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom |
| %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4 |
| %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom |
| store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4 |
| ; Leave the loop once the incremented index equals %n. |
| %inc = add nuw i32 %i.07, 1 |
| %exitcond.not = icmp eq i32 %inc, %n |
| br i1 %exitcond.not, label %for.end, label %for.body |
| |
| for.end: ; preds = %for.body, %entry |
| ret void |
| } |
| |
| ; copy_constant: same element-wise <4 x i32> copy loop as a kernel that reads |
| ; from an addrspace(4) pointer and writes to an addrspace(1) pointer. |
| ; |
| ; Expected code shared by both llc invocations at the top of the file -- |
| ; - the load is the scalar `s_load_b128 s[8:11], s[2:3], 0x0`; its result is |
| ; copied into v[1:4] with v_dual_mov_b32 and written with global_store_b128; |
| ; - unlike the flat and global variants in this file, s[2:3] is not advanced by |
| ; 0xb0 in the preheader and the load uses offset 0x0; |
| ; - both bases advance by 16 bytes per iteration. |
| ; The only difference for the +safe-smem-prefetch invocation (prefix |
| ; GFX12-SPREFETCH) is the extra `s_prefetch_data s[2:3], 0xb0, null, 0` directly |
| ; after the load; here the 0xb0 appears as an operand of the prefetch itself. |
| ; The check lines below are autogenerated; regenerate them with |
| ; utils/update_llc_test_checks.py instead of editing them by hand. |
| define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) { |
| ; GFX12-LABEL: copy_constant: |
| ; GFX12: ; %bb.0: ; %entry |
| ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_eq_u32 s6, 0 |
| ; GFX12-NEXT: s_cbranch_scc1 .LBB2_3 |
| ; GFX12-NEXT: ; %bb.1: ; %for.body.preheader |
| ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX12-NEXT: .LBB2_2: ; %for.body |
| ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 |
| ; GFX12-NEXT: s_add_co_i32 s6, s6, -1 |
| ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 |
| ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9 |
| ; GFX12-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 |
| ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] |
| ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 |
| ; GFX12-NEXT: s_cbranch_scc1 .LBB2_2 |
| ; GFX12-NEXT: .LBB2_3: ; %for.end |
| ; GFX12-NEXT: s_endpgm |
| ; |
| ; GFX12-SPREFETCH-LABEL: copy_constant: |
| ; GFX12-SPREFETCH: ; %bb.0: ; %entry |
| ; GFX12-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34 |
| ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0 |
| ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_3 |
| ; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader |
| ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| ; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX12-SPREFETCH-NEXT: .LBB2_2: ; %for.body |
| ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SPREFETCH-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 |
| ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0 |
| ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 |
| ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 |
| ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 |
| ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9 |
| ; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 |
| ; GFX12-SPREFETCH-NEXT: global_store_b128 v0, v[1:4], s[0:1] |
| ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 |
| ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_2 |
| ; GFX12-SPREFETCH-NEXT: .LBB2_3: ; %for.end |
| ; GFX12-SPREFETCH-NEXT: s_endpgm |
| entry: |
| ; Skip the loop entirely when %n is zero. |
| %cmp6.not = icmp eq i32 %n, 0 |
| br i1 %cmp6.not, label %for.end, label %for.body |
| |
| for.body: ; preds = %entry, %for.body |
| ; %i.07 is the element index -- 0 when coming from %entry, %inc on the back edge. |
| %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] |
| %idxprom = zext i32 %i.07 to i64 |
| ; Element-wise copy of s[i] (addrspace 4) to d[i] (addrspace 1); both accesses |
| ; are only 4-byte aligned. |
| %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom |
| %ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4 |
| %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom |
| store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4 |
| ; Leave the loop once the incremented index equals %n. |
| %inc = add nuw i32 %i.07, 1 |
| %exitcond.not = icmp eq i32 %inc, %n |
| br i1 %exitcond.not, label %for.end, label %for.body |
| |
| for.end: ; preds = %for.body, %entry |
| ret void |
| } |
| |
| ; copy_local: same element-wise <4 x i32> copy loop as a kernel whose source |
| ; and destination are both addrspace(3) pointers. |
| ; |
| ; The expected code is identical for both llc invocations at the top of the |
| ; file, and neither expectation contains an s_prefetch_data instruction -- |
| ; - both pointers and %n arrive in one s_load_b96 (s0 = %d, s1 = %s, s2 = %n as |
| ; used by the loop below); |
| ; - each element is moved with two ds_load_2addr_b32 and two |
| ; ds_store_2addr_b32 instructions; |
| ; - the pointers are advanced by 16 with 32-bit s_add_co_i32 and s2 is |
| ; decremented until it reaches 0. |
| ; The check lines below are autogenerated; regenerate them with |
| ; utils/update_llc_test_checks.py instead of editing them by hand. |
| define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) { |
| ; GFX12-LABEL: copy_local: |
| ; GFX12: ; %bb.0: ; %entry |
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| ; GFX12-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-NEXT: s_cmp_eq_u32 s2, 0 |
| ; GFX12-NEXT: s_cbranch_scc1 .LBB3_2 |
| ; GFX12-NEXT: .LBB3_1: ; %for.body |
| ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-NEXT: s_wait_alu 0xfffe |
| ; GFX12-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX12-NEXT: v_mov_b32_e32 v4, s0 |
| ; GFX12-NEXT: s_add_co_i32 s2, s2, -1 |
| ; GFX12-NEXT: s_add_co_i32 s0, s0, 16 |
| ; GFX12-NEXT: s_add_co_i32 s1, s1, 16 |
| ; GFX12-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3 |
| ; GFX12-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1 |
| ; GFX12-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX12-NEXT: s_wait_dscnt 0x1 |
| ; GFX12-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3 |
| ; GFX12-NEXT: s_wait_dscnt 0x1 |
| ; GFX12-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1 |
| ; GFX12-NEXT: s_cbranch_scc1 .LBB3_1 |
| ; GFX12-NEXT: .LBB3_2: ; %for.end |
| ; GFX12-NEXT: s_endpgm |
| ; |
| ; GFX12-SPREFETCH-LABEL: copy_local: |
| ; GFX12-SPREFETCH: ; %bb.0: ; %entry |
| ; GFX12-SPREFETCH-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 |
| ; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s2, 0 |
| ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_2 |
| ; GFX12-SPREFETCH-NEXT: .LBB3_1: ; %for.body |
| ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 |
| ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe |
| ; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v4, s0 |
| ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s2, s2, -1 |
| ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, 16 |
| ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s1, s1, 16 |
| ; GFX12-SPREFETCH-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3 |
| ; GFX12-SPREFETCH-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1 |
| ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s2, 0 |
| ; GFX12-SPREFETCH-NEXT: s_wait_dscnt 0x1 |
| ; GFX12-SPREFETCH-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3 |
| ; GFX12-SPREFETCH-NEXT: s_wait_dscnt 0x1 |
| ; GFX12-SPREFETCH-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1 |
| ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_1 |
| ; GFX12-SPREFETCH-NEXT: .LBB3_2: ; %for.end |
| ; GFX12-SPREFETCH-NEXT: s_endpgm |
| entry: |
| ; Skip the loop entirely when %n is zero. |
| %cmp6.not = icmp eq i32 %n, 0 |
| br i1 %cmp6.not, label %for.end, label %for.body |
| |
| for.body: ; preds = %entry, %for.body |
| ; %i.07 is the element index -- 0 when coming from %entry, %inc on the back edge. |
| %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] |
| %idxprom = zext i32 %i.07 to i64 |
| ; Element-wise copy of s[i] to d[i]; both accesses are only 4-byte aligned. |
| %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom |
| %ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4 |
| %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom |
| store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4 |
| ; Leave the loop once the incremented index equals %n. |
| %inc = add nuw i32 %i.07, 1 |
| %exitcond.not = icmp eq i32 %inc, %n |
| br i1 %exitcond.not, label %for.end, label %for.body |
| |
| for.end: ; preds = %for.body, %entry |
| ret void |
| } |