; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s

; Loop that copies %n <4 x i32> elements from %s to %d through generic
; (address space 0) pointers; the loop is skipped when %n is zero.
; The checks for the default run contain no prefetch instruction, while the
; checks for the +safe-smem-prefetch run expect one s_prefetch_data inside
; the loop body.
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
; GFX12-LABEL: copy_flat:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_cmp_eq_u32 s6, 0
; GFX12-NEXT:    s_cbranch_scc1 .LBB0_3
; GFX12-NEXT:  ; %bb.1: ; %for.body.preheader
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12-NEXT:  .LBB0_2: ; %for.body
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT:    s_add_co_i32 s6, s6, -1
; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-NEXT:    flat_load_b128 v[0:3], v[0:1] offset:-176
; GFX12-NEXT:    s_cmp_lg_u32 s6, 0
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    flat_store_b128 v[4:5], v[0:3]
; GFX12-NEXT:    s_cbranch_scc1 .LBB0_2
; GFX12-NEXT:  .LBB0_3: ; %for.end
; GFX12-NEXT:    s_endpgm
;
; GFX12-SPREFETCH-LABEL: copy_flat:
; GFX12-SPREFETCH:       ; %bb.0: ; %entry
; GFX12-SPREFETCH-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT:    s_cmp_eq_u32 s6, 0
; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB0_3
; GFX12-SPREFETCH-NEXT:  ; %bb.1: ; %for.body.preheader
; GFX12-SPREFETCH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12-SPREFETCH-NEXT:  .LBB0_2: ; %for.body
; GFX12-SPREFETCH-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-SPREFETCH-NEXT:    s_wait_alu 0xfffe
; GFX12-SPREFETCH-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
; GFX12-SPREFETCH-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s6, s6, -1
; GFX12-SPREFETCH-NEXT:    flat_load_b128 v[0:3], v[0:1] offset:-176
; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-SPREFETCH-NEXT:    s_cmp_lg_u32 s6, 0
; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-SPREFETCH-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SPREFETCH-NEXT:    flat_store_b128 v[4:5], v[0:3]
; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB0_2
; GFX12-SPREFETCH-NEXT:  .LBB0_3: ; %for.end
; GFX12-SPREFETCH-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom
  %ld = load <4 x i32>, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom
  store <4 x i32> %ld, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Loop that copies %n <4 x i32> elements from %s to %d, both addrspace(1)
; pointers; the loop is skipped when %n is zero.
; The checks for the default run contain no prefetch instruction, while the
; checks for the +safe-smem-prefetch run expect one s_prefetch_data right
; after the global load in the loop body.
define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
; GFX12-LABEL: copy_global:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_cmp_eq_u32 s6, 0
; GFX12-NEXT:    s_cbranch_scc1 .LBB1_3
; GFX12-NEXT:  ; %bb.1: ; %for.body.preheader
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12-NEXT:  .LBB1_2: ; %for.body
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    global_load_b128 v[1:4], v0, s[2:3] offset:-176
; GFX12-NEXT:    s_add_co_i32 s6, s6, -1
; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-NEXT:    s_cmp_lg_u32 s6, 0
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-NEXT:    s_cbranch_scc1 .LBB1_2
; GFX12-NEXT:  .LBB1_3: ; %for.end
; GFX12-NEXT:    s_endpgm
;
; GFX12-SPREFETCH-LABEL: copy_global:
; GFX12-SPREFETCH:       ; %bb.0: ; %entry
; GFX12-SPREFETCH-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT:    s_cmp_eq_u32 s6, 0
; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB1_3
; GFX12-SPREFETCH-NEXT:  ; %bb.1: ; %for.body.preheader
; GFX12-SPREFETCH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-SPREFETCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12-SPREFETCH-NEXT:  .LBB1_2: ; %for.body
; GFX12-SPREFETCH-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-SPREFETCH-NEXT:    global_load_b128 v[1:4], v0, s[2:3] offset:-176
; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s6, s6, -1
; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-SPREFETCH-NEXT:    s_cmp_lg_u32 s6, 0
; GFX12-SPREFETCH-NEXT:    s_wait_loadcnt 0x0
; GFX12-SPREFETCH-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB1_2
; GFX12-SPREFETCH-NEXT:  .LBB1_3: ; %for.end
; GFX12-SPREFETCH-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Loop that copies %n <4 x i32> elements from an addrspace(4) source %s to an
; addrspace(1) destination %d; the loop is skipped when %n is zero.
; The checks for the default run contain no prefetch instruction, while the
; checks for the +safe-smem-prefetch run expect one s_prefetch_data with an
; immediate offset of 0xb0 right after the scalar load in the loop body.
define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
; GFX12-LABEL: copy_constant:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_cmp_eq_u32 s6, 0
; GFX12-NEXT:    s_cbranch_scc1 .LBB2_3
; GFX12-NEXT:  ; %bb.1: ; %for.body.preheader
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:  .LBB2_2: ; %for.body
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
; GFX12-NEXT:    s_add_co_i32 s6, s6, -1
; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-NEXT:    s_cmp_lg_u32 s6, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GFX12-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-NEXT:    s_cbranch_scc1 .LBB2_2
; GFX12-NEXT:  .LBB2_3: ; %for.end
; GFX12-NEXT:    s_endpgm
;
; GFX12-SPREFETCH-LABEL: copy_constant:
; GFX12-SPREFETCH:       ; %bb.0: ; %entry
; GFX12-SPREFETCH-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT:    s_cmp_eq_u32 s6, 0
; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB2_3
; GFX12-SPREFETCH-NEXT:  ; %bb.1: ; %for.body.preheader
; GFX12-SPREFETCH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-SPREFETCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-SPREFETCH-NEXT:  .LBB2_2: ; %for.body
; GFX12-SPREFETCH-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[2:3], 0xb0, null, 0
; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s6, s6, -1
; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-SPREFETCH-NEXT:    s_cmp_lg_u32 s6, 0
; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GFX12-SPREFETCH-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
; GFX12-SPREFETCH-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
; GFX12-SPREFETCH-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB2_2
; GFX12-SPREFETCH-NEXT:  .LBB2_3: ; %for.end
; GFX12-SPREFETCH-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Loop that copies %n <4 x i32> elements from %s to %d, both addrspace(3)
; pointers; the loop is skipped when %n is zero.
; Neither run's checks contain a prefetch instruction for this kernel: the
; expected instruction sequence is the same with and without
; +safe-smem-prefetch.
define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
; GFX12-LABEL: copy_local:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_cmp_eq_u32 s2, 0
; GFX12-NEXT:    s_cbranch_scc1 .LBB3_2
; GFX12-NEXT:  .LBB3_1: ; %for.body
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    v_mov_b32_e32 v2, s1
; GFX12-NEXT:    v_mov_b32_e32 v4, s0
; GFX12-NEXT:    s_add_co_i32 s2, s2, -1
; GFX12-NEXT:    s_add_co_i32 s0, s0, 16
; GFX12-NEXT:    s_add_co_i32 s1, s1, 16
; GFX12-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GFX12-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset1:1
; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
; GFX12-NEXT:    s_wait_dscnt 0x1
; GFX12-NEXT:    ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GFX12-NEXT:    s_wait_dscnt 0x1
; GFX12-NEXT:    ds_store_2addr_b32 v4, v2, v3 offset1:1
; GFX12-NEXT:    s_cbranch_scc1 .LBB3_1
; GFX12-NEXT:  .LBB3_2: ; %for.end
; GFX12-NEXT:    s_endpgm
;
; GFX12-SPREFETCH-LABEL: copy_local:
; GFX12-SPREFETCH:       ; %bb.0: ; %entry
; GFX12-SPREFETCH-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT:    s_cmp_eq_u32 s2, 0
; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB3_2
; GFX12-SPREFETCH-NEXT:  .LBB3_1: ; %for.body
; GFX12-SPREFETCH-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-SPREFETCH-NEXT:    s_wait_alu 0xfffe
; GFX12-SPREFETCH-NEXT:    v_mov_b32_e32 v2, s1
; GFX12-SPREFETCH-NEXT:    v_mov_b32_e32 v4, s0
; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s2, s2, -1
; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s0, s0, 16
; GFX12-SPREFETCH-NEXT:    s_add_co_i32 s1, s1, 16
; GFX12-SPREFETCH-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GFX12-SPREFETCH-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset1:1
; GFX12-SPREFETCH-NEXT:    s_cmp_lg_u32 s2, 0
; GFX12-SPREFETCH-NEXT:    s_wait_dscnt 0x1
; GFX12-SPREFETCH-NEXT:    ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GFX12-SPREFETCH-NEXT:    s_wait_dscnt 0x1
; GFX12-SPREFETCH-NEXT:    ds_store_2addr_b32 v4, v2, v3 offset1:1
; GFX12-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB3_1
; GFX12-SPREFETCH-NEXT:  .LBB3_2: ; %for.end
; GFX12-SPREFETCH-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}