blob: e871b80cbe29eafa6568abac493b279e85d06d8d [file] [log] [blame]
Jay Foadba5c4ac2021-08-03 17:13:02 +01001; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
4; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
6; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
7
; Declarations of the llvm.cttz intrinsic overloads (each takes the value
; plus an i1 flag operand) and of the AMDGPU workitem-id intrinsic.
; Narrow scalar overloads.
8declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
9declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
10declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
11
; 32-bit scalar and vector overloads.
12declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
13declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
14declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
15
; 64-bit scalar and vector overloads.
16declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
17declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) nounwind readnone
18declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) nounwind readnone
19
; Returns an i32 that the v_* kernels below use as a per-workitem GEP index.
20declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
21
21
; s_cttz_i32 - i32 cttz of a value passed directly as a kernel argument.
; %val is fed to llvm.cttz.i32 with the i1 flag operand set to false and the
; result is stored to %out. The expected code below uses the scalar
; find-first-set instruction (s_ff1_i32_b32) followed by s_min_u32 against 32
; on the amdgcn runs, and FFBL_INT plus CNDE_INT with literal 32 on r600.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -050022define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +010023; SI-LABEL: s_cttz_i32:
24; SI: ; %bb.0:
Jay Foad2b639332021-08-05 14:32:25 +010025; SI-NEXT: s_load_dword s2, s[0:1], 0xb
Jay Foadba5c4ac2021-08-03 17:13:02 +010026; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
27; SI-NEXT: s_mov_b32 s3, 0xf000
28; SI-NEXT: s_waitcnt lgkmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +010029; SI-NEXT: s_ff1_i32_b32 s2, s2
30; SI-NEXT: s_min_u32 s4, s2, 32
Jay Foadba5c4ac2021-08-03 17:13:02 +010031; SI-NEXT: s_mov_b32 s2, -1
Jay Foad2b639332021-08-05 14:32:25 +010032; SI-NEXT: v_mov_b32_e32 v0, s4
Jay Foadba5c4ac2021-08-03 17:13:02 +010033; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
34; SI-NEXT: s_endpgm
35;
36; VI-LABEL: s_cttz_i32:
37; VI: ; %bb.0:
Austin Kerbowda067ed2021-11-10 09:59:31 -080038; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
39; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
40; VI-NEXT: s_mov_b32 s3, 0xf000
41; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +010042; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -080043; VI-NEXT: s_ff1_i32_b32 s4, s4
44; VI-NEXT: s_min_u32 s4, s4, 32
45; VI-NEXT: v_mov_b32_e32 v0, s4
46; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +010047; VI-NEXT: s_endpgm
48;
49; EG-LABEL: s_cttz_i32:
50; EG: ; %bb.0:
51; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
52; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
53; EG-NEXT: CF_END
54; EG-NEXT: PAD
55; EG-NEXT: ALU clause starting at 4:
56; EG-NEXT: FFBL_INT * T0.W, KC0[2].Z,
57; EG-NEXT: CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
58; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
59; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
60;
61; GFX10-LABEL: s_cttz_i32:
62; GFX10: ; %bb.0:
63; GFX10-NEXT: s_clause 0x1
64; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
65; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
66; GFX10-NEXT: v_mov_b32_e32 v0, 0
67; GFX10-NEXT: s_waitcnt lgkmcnt(0)
68; GFX10-NEXT: s_ff1_i32_b32 s0, s4
Jay Foad2b639332021-08-05 14:32:25 +010069; GFX10-NEXT: s_min_u32 s0, s0, 32
Jay Foadba5c4ac2021-08-03 17:13:02 +010070; GFX10-NEXT: v_mov_b32_e32 v1, s0
71; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
72; GFX10-NEXT: s_endpgm
73;
74; GFX10-GISEL-LABEL: s_cttz_i32:
75; GFX10-GISEL: ; %bb.0:
76; GFX10-GISEL-NEXT: s_clause 0x1
77; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
78; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
79; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
80; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
81; GFX10-GISEL-NEXT: s_ff1_i32_b32 s0, s4
Jay Foad83610d42021-08-03 17:11:08 +010082; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32
Jay Foadba5c4ac2021-08-03 17:13:02 +010083; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
84; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
85; GFX10-GISEL-NEXT: s_endpgm
86 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
Matt Arsenaultb5bc2052022-11-29 18:26:06 -050087 store i32 %cttz, ptr addrspace(1) %out, align 4
Jay Foadba5c4ac2021-08-03 17:13:02 +010088 ret void
89}
90
; v_cttz_i32 - i32 cttz of a value loaded from memory per workitem.
; The input is loaded from %valptr indexed by llvm.amdgcn.workitem.id.x,
; passed to llvm.cttz.i32 with the i1 flag operand set to false, and the
; result is stored to %out (the store address is not indexed by the workitem
; id). The expected amdgcn code uses v_ffbl_b32 followed by v_min_u32 against
; 32; the r600 code uses FFBL_INT plus CNDE_INT with literal 32.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -050091define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +010092; SI-LABEL: v_cttz_i32:
93; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +090094; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
95; SI-NEXT: s_mov_b32 s7, 0xf000
96; SI-NEXT: s_mov_b32 s10, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +010097; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
98; SI-NEXT: v_mov_b32_e32 v1, 0
Carl Ritson4c4db812022-07-30 11:13:20 +090099; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +0100100; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900101; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
102; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
103; SI-NEXT: s_mov_b32 s6, -1
104; SI-NEXT: s_mov_b32 s4, s0
105; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100106; SI-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +0100107; SI-NEXT: v_ffbl_b32_e32 v0, v0
108; SI-NEXT: v_min_u32_e32 v0, 32, v0
Carl Ritson4c4db812022-07-30 11:13:20 +0900109; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100110; SI-NEXT: s_endpgm
111;
112; VI-LABEL: v_cttz_i32:
113; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900114; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100115; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100116; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -0800117; VI-NEXT: v_mov_b32_e32 v1, s3
118; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100119; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
120; VI-NEXT: flat_load_dword v0, v[0:1]
Austin Kerbowda067ed2021-11-10 09:59:31 -0800121; VI-NEXT: s_mov_b32 s3, 0xf000
122; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100123; VI-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +0100124; VI-NEXT: v_ffbl_b32_e32 v0, v0
125; VI-NEXT: v_min_u32_e32 v0, 32, v0
Austin Kerbowda067ed2021-11-10 09:59:31 -0800126; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100127; VI-NEXT: s_endpgm
128;
129; EG-LABEL: v_cttz_i32:
130; EG: ; %bb.0:
131; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
132; EG-NEXT: TEX 0 @6
133; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
134; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
135; EG-NEXT: CF_END
136; EG-NEXT: PAD
137; EG-NEXT: Fetch clause starting at 6:
138; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
139; EG-NEXT: ALU clause starting at 8:
140; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
141; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
142; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
143; EG-NEXT: ALU clause starting at 11:
144; EG-NEXT: FFBL_INT * T0.W, T0.X,
145; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
146; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
147; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
148;
149; GFX10-LABEL: v_cttz_i32:
150; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900151; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100152; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foad2b639332021-08-05 14:32:25 +0100153; GFX10-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100154; GFX10-NEXT: s_waitcnt lgkmcnt(0)
155; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
156; GFX10-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +0100157; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
158; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
159; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
Jay Foadba5c4ac2021-08-03 17:13:02 +0100160; GFX10-NEXT: s_endpgm
161;
162; GFX10-GISEL-LABEL: v_cttz_i32:
163; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900164; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100165; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foad83610d42021-08-03 17:11:08 +0100166; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100167; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
168; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
169; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
Jay Foad83610d42021-08-03 17:11:08 +0100170; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
171; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100172; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
173; GFX10-GISEL-NEXT: s_endpgm
174 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500175 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
176 %val = load i32, ptr addrspace(1) %in.gep, align 4
Jay Foadba5c4ac2021-08-03 17:13:02 +0100177 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500178 store i32 %cttz, ptr addrspace(1) %out, align 4
Jay Foadba5c4ac2021-08-03 17:13:02 +0100179 ret void
180}
181
; v_cttz_v2i32 - <2 x i32> cttz of a vector loaded from memory per workitem.
; A <2 x i32> is loaded from %valptr indexed by llvm.amdgcn.workitem.id.x,
; passed to llvm.cttz.v2i32 with the i1 flag operand set to false, and the
; result vector is stored to %out. The expected code handles each element
; separately: one v_ffbl_b32 + v_min_u32 (against 32) pair per lane on amdgcn,
; one FFBL_INT + CNDE_INT pair per lane on r600.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500182define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +0100183; SI-LABEL: v_cttz_v2i32:
184; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900185; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
186; SI-NEXT: s_mov_b32 s7, 0xf000
187; SI-NEXT: s_mov_b32 s10, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100188; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
189; SI-NEXT: v_mov_b32_e32 v1, 0
Carl Ritson4c4db812022-07-30 11:13:20 +0900190; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +0100191; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900192; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
193; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
194; SI-NEXT: s_mov_b32 s6, -1
195; SI-NEXT: s_mov_b32 s4, s0
196; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100197; SI-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +0100198; SI-NEXT: v_ffbl_b32_e32 v1, v1
199; SI-NEXT: v_ffbl_b32_e32 v0, v0
200; SI-NEXT: v_min_u32_e32 v1, 32, v1
201; SI-NEXT: v_min_u32_e32 v0, 32, v0
Carl Ritson4c4db812022-07-30 11:13:20 +0900202; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100203; SI-NEXT: s_endpgm
204;
205; VI-LABEL: v_cttz_v2i32:
206; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900207; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100208; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100209; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -0800210; VI-NEXT: v_mov_b32_e32 v1, s3
211; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100212; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
213; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
Austin Kerbowda067ed2021-11-10 09:59:31 -0800214; VI-NEXT: s_mov_b32 s3, 0xf000
215; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100216; VI-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +0100217; VI-NEXT: v_ffbl_b32_e32 v1, v1
218; VI-NEXT: v_ffbl_b32_e32 v0, v0
219; VI-NEXT: v_min_u32_e32 v1, 32, v1
220; VI-NEXT: v_min_u32_e32 v0, 32, v0
Austin Kerbowda067ed2021-11-10 09:59:31 -0800221; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100222; VI-NEXT: s_endpgm
223;
224; EG-LABEL: v_cttz_v2i32:
225; EG: ; %bb.0:
226; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
227; EG-NEXT: TEX 0 @6
228; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
229; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
230; EG-NEXT: CF_END
231; EG-NEXT: PAD
232; EG-NEXT: Fetch clause starting at 6:
233; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
234; EG-NEXT: ALU clause starting at 8:
235; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
236; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
237; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
238; EG-NEXT: ALU clause starting at 11:
239; EG-NEXT: FFBL_INT * T0.W, T0.Y,
240; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
241; EG-NEXT: FFBL_INT * T0.W, T0.X,
242; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
243; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
244; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
245; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
246;
247; GFX10-LABEL: v_cttz_v2i32:
248; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900249; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100250; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
Jay Foad2b639332021-08-05 14:32:25 +0100251; GFX10-NEXT: v_mov_b32_e32 v2, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100252; GFX10-NEXT: s_waitcnt lgkmcnt(0)
253; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
254; GFX10-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +0100255; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
256; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
257; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
258; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
259; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
Jay Foadba5c4ac2021-08-03 17:13:02 +0100260; GFX10-NEXT: s_endpgm
261;
262; GFX10-GISEL-LABEL: v_cttz_v2i32:
263; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900264; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100265; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
Jay Foad83610d42021-08-03 17:11:08 +0100266; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100267; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
268; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
269; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
Jay Foad83610d42021-08-03 17:11:08 +0100270; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
271; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
272; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
273; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100274; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
275; GFX10-GISEL-NEXT: s_endpgm
276 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500277 %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
278 %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
Jay Foadba5c4ac2021-08-03 17:13:02 +0100279 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500280 store <2 x i32> %cttz, ptr addrspace(1) %out, align 8
Jay Foadba5c4ac2021-08-03 17:13:02 +0100281 ret void
282}
283
; v_cttz_v4i32 - <4 x i32> cttz of a vector loaded from memory per workitem.
; A <4 x i32> is loaded from %valptr indexed by llvm.amdgcn.workitem.id.x,
; passed to llvm.cttz.v4i32 with the i1 flag operand set to false, and the
; result vector is stored to %out. The expected code handles each of the four
; elements separately: v_ffbl_b32 + v_min_u32 (against 32) per lane on amdgcn,
; FFBL_INT + CNDE_INT with literal 32 per lane on r600.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500284define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +0100285; SI-LABEL: v_cttz_v4i32:
286; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900287; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
288; SI-NEXT: s_mov_b32 s7, 0xf000
289; SI-NEXT: s_mov_b32 s10, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100290; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
291; SI-NEXT: v_mov_b32_e32 v1, 0
Carl Ritson4c4db812022-07-30 11:13:20 +0900292; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +0100293; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900294; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
295; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
296; SI-NEXT: s_mov_b32 s6, -1
297; SI-NEXT: s_mov_b32 s4, s0
298; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100299; SI-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +0100300; SI-NEXT: v_ffbl_b32_e32 v3, v3
301; SI-NEXT: v_ffbl_b32_e32 v2, v2
302; SI-NEXT: v_ffbl_b32_e32 v1, v1
303; SI-NEXT: v_ffbl_b32_e32 v0, v0
304; SI-NEXT: v_min_u32_e32 v3, 32, v3
305; SI-NEXT: v_min_u32_e32 v2, 32, v2
306; SI-NEXT: v_min_u32_e32 v1, 32, v1
307; SI-NEXT: v_min_u32_e32 v0, 32, v0
Carl Ritson4c4db812022-07-30 11:13:20 +0900308; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100309; SI-NEXT: s_endpgm
310;
311; VI-LABEL: v_cttz_v4i32:
312; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900313; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100314; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100315; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -0800316; VI-NEXT: v_mov_b32_e32 v1, s3
317; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100318; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
319; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
Austin Kerbowda067ed2021-11-10 09:59:31 -0800320; VI-NEXT: s_mov_b32 s3, 0xf000
321; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100322; VI-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +0100323; VI-NEXT: v_ffbl_b32_e32 v3, v3
324; VI-NEXT: v_ffbl_b32_e32 v2, v2
325; VI-NEXT: v_ffbl_b32_e32 v1, v1
326; VI-NEXT: v_ffbl_b32_e32 v0, v0
327; VI-NEXT: v_min_u32_e32 v3, 32, v3
328; VI-NEXT: v_min_u32_e32 v2, 32, v2
329; VI-NEXT: v_min_u32_e32 v1, 32, v1
330; VI-NEXT: v_min_u32_e32 v0, 32, v0
Austin Kerbowda067ed2021-11-10 09:59:31 -0800331; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100332; VI-NEXT: s_endpgm
333;
334; EG-LABEL: v_cttz_v4i32:
335; EG: ; %bb.0:
336; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
337; EG-NEXT: TEX 0 @6
338; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
339; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
340; EG-NEXT: CF_END
341; EG-NEXT: PAD
342; EG-NEXT: Fetch clause starting at 6:
343; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
344; EG-NEXT: ALU clause starting at 8:
345; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
346; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
347; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
348; EG-NEXT: ALU clause starting at 11:
349; EG-NEXT: FFBL_INT * T1.W, T0.W,
350; EG-NEXT: FFBL_INT T2.W, T0.Z,
351; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
352; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
353; EG-NEXT: CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
354; EG-NEXT: FFBL_INT * T1.W, T0.Y,
355; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
356; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
357; EG-NEXT: FFBL_INT * T1.W, T0.X,
358; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
359; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
360; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
361; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
362;
363; GFX10-LABEL: v_cttz_v4i32:
364; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900365; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100366; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100367; GFX10-NEXT: v_mov_b32_e32 v4, 0
368; GFX10-NEXT: s_waitcnt lgkmcnt(0)
369; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
370; GFX10-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +0100371; GFX10-NEXT: v_ffbl_b32_e32 v3, v3
372; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
373; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
374; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
375; GFX10-NEXT: v_min_u32_e32 v3, 32, v3
376; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
377; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
378; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100379; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
380; GFX10-NEXT: s_endpgm
381;
382; GFX10-GISEL-LABEL: v_cttz_v4i32:
383; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900384; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100385; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
Jay Foad83610d42021-08-03 17:11:08 +0100386; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100387; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
388; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
389; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
Jay Foad83610d42021-08-03 17:11:08 +0100390; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
391; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
392; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
393; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v3, v3
394; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
395; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
396; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
397; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3
Jay Foadba5c4ac2021-08-03 17:13:02 +0100398; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
399; GFX10-GISEL-NEXT: s_endpgm
400 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500401 %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
402 %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
Jay Foadba5c4ac2021-08-03 17:13:02 +0100403 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500404 store <4 x i32> %cttz, ptr addrspace(1) %out, align 16
Jay Foadba5c4ac2021-08-03 17:13:02 +0100405 ret void
406}
407
; v_cttz_i8 - i8 cttz of a byte loaded from memory.
; An i8 is loaded directly from %valptr (no workitem-id indexing), passed to
; llvm.cttz.i8 with the i1 flag operand set to false, and the i8 result is
; stored to %out. Instead of a min/select against the bit width, the expected
; code ORs the loaded byte with 0x100 (256 on r600) before the 32-bit
; find-first-set, i.e. bit 8 is always set in the operand.
; NOTE(review): presumably this makes a zero byte produce 8 - confirm against
; the cttz lowering.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500408define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +0100409; SI-LABEL: v_cttz_i8:
410; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900411; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
412; SI-NEXT: s_mov_b32 s7, 0xf000
413; SI-NEXT: s_mov_b32 s6, -1
414; SI-NEXT: s_mov_b32 s10, s6
415; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +0100416; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900417; SI-NEXT: s_mov_b32 s8, s2
418; SI-NEXT: s_mov_b32 s9, s3
419; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
420; SI-NEXT: s_mov_b32 s4, s0
421; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100422; SI-NEXT: s_waitcnt vmcnt(0)
423; SI-NEXT: v_or_b32_e32 v0, 0x100, v0
424; SI-NEXT: v_ffbl_b32_e32 v0, v0
Carl Ritson4c4db812022-07-30 11:13:20 +0900425; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100426; SI-NEXT: s_endpgm
427;
428; VI-LABEL: v_cttz_i8:
429; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900430; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
431; VI-NEXT: s_mov_b32 s7, 0xf000
432; VI-NEXT: s_mov_b32 s6, -1
433; VI-NEXT: s_mov_b32 s10, s6
434; VI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +0100435; VI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900436; VI-NEXT: s_mov_b32 s8, s2
437; VI-NEXT: s_mov_b32 s9, s3
438; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
439; VI-NEXT: s_mov_b32 s4, s0
440; VI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100441; VI-NEXT: s_waitcnt vmcnt(0)
442; VI-NEXT: v_or_b32_e32 v0, 0x100, v0
443; VI-NEXT: v_ffbl_b32_e32 v0, v0
Carl Ritson4c4db812022-07-30 11:13:20 +0900444; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100445; VI-NEXT: s_endpgm
446;
447; EG-LABEL: v_cttz_i8:
448; EG: ; %bb.0:
449; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
450; EG-NEXT: TEX 0 @6
451; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
452; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
453; EG-NEXT: CF_END
454; EG-NEXT: PAD
455; EG-NEXT: Fetch clause starting at 6:
456; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
457; EG-NEXT: ALU clause starting at 8:
458; EG-NEXT: MOV * T0.X, KC0[2].Z,
459; EG-NEXT: ALU clause starting at 9:
460; EG-NEXT: OR_INT * T0.W, T0.X, literal.x,
461; EG-NEXT: 256(3.587324e-43), 0(0.000000e+00)
462; EG-NEXT: FFBL_INT T0.W, PV.W,
463; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
464; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
465; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
466; EG-NEXT: LSHL * T1.W, PS, literal.y,
467; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
468; EG-NEXT: LSHL T0.X, PV.W, PS,
469; EG-NEXT: LSHL * T0.W, literal.x, PS,
470; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
471; EG-NEXT: MOV T0.Y, 0.0,
472; EG-NEXT: MOV * T0.Z, 0.0,
473; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
474; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
475;
476; GFX10-LABEL: v_cttz_i8:
477; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900478; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100479; GFX10-NEXT: v_mov_b32_e32 v0, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100480; GFX10-NEXT: s_waitcnt lgkmcnt(0)
481; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
482; GFX10-NEXT: s_waitcnt vmcnt(0)
483; GFX10-NEXT: v_or_b32_e32 v1, 0x100, v1
484; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
485; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
486; GFX10-NEXT: s_endpgm
487;
488; GFX10-GISEL-LABEL: v_cttz_i8:
489; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900490; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100491; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100492; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
493; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
494; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
Jay Foad57b91072021-08-06 11:05:42 +0100495; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100496; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
497; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
498; GFX10-GISEL-NEXT: s_endpgm
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500499 %val = load i8, ptr addrspace(1) %valptr
Jay Foadba5c4ac2021-08-03 17:13:02 +0100500 %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500501 store i8 %cttz, ptr addrspace(1) %out
Jay Foadba5c4ac2021-08-03 17:13:02 +0100502 ret void
503}
504
; s_cttz_i64 - i64 cttz of a value passed directly as a kernel argument.
; %val is fed to llvm.cttz.i64 with the i1 flag operand set to false and the
; full i64 result is stored to %out. An unnamed [8 x i32] argument sits
; between %out and %val; the expected loads read %val from a separate offset
; (0x13 / 0x4c) than %out (0x9 / 0x24).
; NOTE(review): the padding argument presumably exists to keep the two
; arguments from being fetched with one wide load - confirm intent.
; The SelectionDAG runs expect two 32-bit s_ff1_i32_b32 operations combined
; with an add of 32 and v_min3_u32 against 64; the GlobalISel run expects a
; single s_ff1_i32_b64 followed by s_min_u32 against 64.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500505define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +0100506; SI-LABEL: s_cttz_i64:
507; SI: ; %bb.0:
508; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
509; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
510; SI-NEXT: s_mov_b32 s3, 0xf000
511; SI-NEXT: s_mov_b32 s2, -1
512; SI-NEXT: s_waitcnt lgkmcnt(0)
Jay Foade6c364a2021-08-05 09:58:29 +0100513; SI-NEXT: s_ff1_i32_b32 s5, s5
514; SI-NEXT: s_min_u32 s5, s5, 0xffffffdf
515; SI-NEXT: s_add_i32 s5, s5, 32
516; SI-NEXT: s_ff1_i32_b32 s4, s4
517; SI-NEXT: v_mov_b32_e32 v0, s5
518; SI-NEXT: v_min3_u32 v0, s4, v0, 64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100519; SI-NEXT: v_mov_b32_e32 v1, 0
520; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
521; SI-NEXT: s_endpgm
522;
523; VI-LABEL: s_cttz_i64:
524; VI: ; %bb.0:
Austin Kerbowda067ed2021-11-10 09:59:31 -0800525; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
526; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
527; VI-NEXT: s_mov_b32 s3, 0xf000
528; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100529; VI-NEXT: v_mov_b32_e32 v1, 0
530; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -0800531; VI-NEXT: s_ff1_i32_b32 s5, s5
532; VI-NEXT: v_add_u32_e64 v0, s[6:7], s5, 32 clamp
533; VI-NEXT: s_ff1_i32_b32 s4, s4
534; VI-NEXT: v_min3_u32 v0, s4, v0, 64
535; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100536; VI-NEXT: s_endpgm
537;
538; EG-LABEL: s_cttz_i64:
539; EG: ; %bb.0:
540; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
541; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
542; EG-NEXT: CF_END
543; EG-NEXT: PAD
544; EG-NEXT: ALU clause starting at 4:
545; EG-NEXT: FFBL_INT * T0.W, KC0[5].X,
546; EG-NEXT: CNDE_INT * T0.W, KC0[5].X, literal.x, PV.W,
547; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
548; EG-NEXT: FFBL_INT T1.W, KC0[4].W,
549; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
550; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
551; EG-NEXT: CNDE_INT T0.X, KC0[4].W, PS, PV.W,
552; EG-NEXT: MOV T0.Y, 0.0,
553; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
554; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
555;
556; GFX10-LABEL: s_cttz_i64:
557; GFX10: ; %bb.0:
558; GFX10-NEXT: s_clause 0x1
559; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
560; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
561; GFX10-NEXT: v_mov_b32_e32 v1, 0
562; GFX10-NEXT: s_waitcnt lgkmcnt(0)
563; GFX10-NEXT: s_ff1_i32_b32 s0, s3
Jay Foade6c364a2021-08-05 09:58:29 +0100564; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp
565; GFX10-NEXT: s_ff1_i32_b32 s0, s2
566; GFX10-NEXT: v_min3_u32 v0, s0, v0, 64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100567; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
568; GFX10-NEXT: s_endpgm
569;
570; GFX10-GISEL-LABEL: s_cttz_i64:
571; GFX10-GISEL: ; %bb.0:
572; GFX10-GISEL-NEXT: s_clause 0x1
573; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
574; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
Jay Foad342642d2021-08-06 13:09:47 +0100575; GFX10-GISEL-NEXT: s_mov_b32 s1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100576; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
577; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
578; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3]
Jay Foad83610d42021-08-03 17:11:08 +0100579; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100580; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
581; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
582; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
583; GFX10-GISEL-NEXT: s_endpgm
584 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500585 store i64 %cttz, ptr addrspace(1) %out
Jay Foadba5c4ac2021-08-03 17:13:02 +0100586 ret void
587}
588
; s_cttz_i64_trunc - i64 cttz of a kernel argument, truncated to i32.
; %val is fed to llvm.cttz.i64 with the i1 flag operand set to false; the i64
; result is truncated to i32 and only that i32 is stored to %out. Unlike
; s_cttz_i64 there is no padding argument, and the expected amdgcn code
; fetches %out and %val with a single s_load_dwordx4. The count itself is
; computed as in s_cttz_i64 (two 32-bit find-first-set ops with v_min3_u32
; against 64, or one s_ff1_i32_b64 with s_min_u32 for GlobalISel), but only a
; single dword is stored.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500589define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +0100590; SI-LABEL: s_cttz_i64_trunc:
591; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900592; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
593; SI-NEXT: s_mov_b32 s7, 0xf000
594; SI-NEXT: s_mov_b32 s6, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100595; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900596; SI-NEXT: s_mov_b32 s4, s0
597; SI-NEXT: s_mov_b32 s5, s1
598; SI-NEXT: s_ff1_i32_b32 s0, s3
599; SI-NEXT: s_min_u32 s0, s0, 0xffffffdf
600; SI-NEXT: s_add_i32 s0, s0, 32
601; SI-NEXT: s_ff1_i32_b32 s1, s2
602; SI-NEXT: v_mov_b32_e32 v0, s0
603; SI-NEXT: v_min3_u32 v0, s1, v0, 64
604; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100605; SI-NEXT: s_endpgm
606;
607; VI-LABEL: s_cttz_i64_trunc:
608; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900609; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
610; VI-NEXT: s_mov_b32 s7, 0xf000
611; VI-NEXT: s_mov_b32 s6, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100612; VI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900613; VI-NEXT: s_mov_b32 s4, s0
614; VI-NEXT: s_ff1_i32_b32 s0, s3
615; VI-NEXT: s_mov_b32 s5, s1
616; VI-NEXT: v_add_u32_e64 v0, s[0:1], s0, 32 clamp
617; VI-NEXT: s_ff1_i32_b32 s0, s2
618; VI-NEXT: v_min3_u32 v0, s0, v0, 64
619; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100620; VI-NEXT: s_endpgm
621;
622; EG-LABEL: s_cttz_i64_trunc:
623; EG: ; %bb.0:
624; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
625; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
626; EG-NEXT: CF_END
627; EG-NEXT: PAD
628; EG-NEXT: ALU clause starting at 4:
629; EG-NEXT: FFBL_INT * T0.W, KC0[3].X,
630; EG-NEXT: CNDE_INT * T0.W, KC0[3].X, literal.x, PV.W,
631; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
632; EG-NEXT: FFBL_INT T1.W, KC0[2].W,
633; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
634; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
635; EG-NEXT: CNDE_INT T0.X, KC0[2].W, PS, PV.W,
636; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
637; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
638;
639; GFX10-LABEL: s_cttz_i64_trunc:
640; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900641; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foade6c364a2021-08-05 09:58:29 +0100642; GFX10-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100643; GFX10-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900644; GFX10-NEXT: s_ff1_i32_b32 s3, s3
645; GFX10-NEXT: s_ff1_i32_b32 s2, s2
646; GFX10-NEXT: v_add_nc_u32_e64 v0, s3, 32 clamp
647; GFX10-NEXT: v_min3_u32 v0, s2, v0, 64
648; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
Jay Foadba5c4ac2021-08-03 17:13:02 +0100649; GFX10-NEXT: s_endpgm
650;
651; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
652; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900653; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100654; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
655; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900656; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3]
657; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64
658; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
659; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
Jay Foadba5c4ac2021-08-03 17:13:02 +0100660; GFX10-GISEL-NEXT: s_endpgm
661 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
662 %trunc = trunc i64 %cttz to i32
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500663 store i32 %trunc, ptr addrspace(1) %out
Jay Foadba5c4ac2021-08-03 17:13:02 +0100664 ret void
665}
666
; v_cttz_i64: each work item loads the i64 at %in[workitem.id.x], takes cttz
; (second operand `i1 false`), and stores the full i64 result to
; %out[workitem.id.x]. The expected amdgcn code runs v_ffbl_b32 on both 32-bit
; halves of the loaded value, adds 32 to one of them, and takes the minimum of
; the two together with 64; the second result dword is written as 0.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500667define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +0100668; SI-LABEL: v_cttz_i64:
669; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900670; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
Jay Foadba5c4ac2021-08-03 17:13:02 +0100671; SI-NEXT: s_mov_b32 s7, 0xf000
672; SI-NEXT: s_mov_b32 s6, 0
673; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
674; SI-NEXT: v_mov_b32_e32 v1, 0
675; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900676; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
Jay Foadba5c4ac2021-08-03 17:13:02 +0100677; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
Carl Ritson4c4db812022-07-30 11:13:20 +0900678; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
Jay Foadba5c4ac2021-08-03 17:13:02 +0100679; SI-NEXT: s_waitcnt vmcnt(0)
Jay Foade6c364a2021-08-05 09:58:29 +0100680; SI-NEXT: v_ffbl_b32_e32 v3, v3
681; SI-NEXT: v_min_u32_e32 v3, 0xffffffdf, v3
682; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v3
683; SI-NEXT: v_ffbl_b32_e32 v2, v2
684; SI-NEXT: v_min3_u32 v2, v2, v3, 64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100685; SI-NEXT: v_mov_b32_e32 v3, v1
Carl Ritson4c4db812022-07-30 11:13:20 +0900686; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100687; SI-NEXT: s_endpgm
688;
689; VI-LABEL: v_cttz_i64:
690; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900691; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100692; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100693; VI-NEXT: v_mov_b32_e32 v2, 0
694; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -0800695; VI-NEXT: v_mov_b32_e32 v1, s3
696; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3
Jay Foade6c364a2021-08-05 09:58:29 +0100697; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
Jay Foadba5c4ac2021-08-03 17:13:02 +0100698; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
Austin Kerbowda067ed2021-11-10 09:59:31 -0800699; VI-NEXT: v_mov_b32_e32 v4, s1
700; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3
Jay Foade6c364a2021-08-05 09:58:29 +0100701; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
Jay Foadba5c4ac2021-08-03 17:13:02 +0100702; VI-NEXT: s_waitcnt vmcnt(0)
Jay Foade6c364a2021-08-05 09:58:29 +0100703; VI-NEXT: v_ffbl_b32_e32 v1, v1
704; VI-NEXT: v_add_u32_e64 v1, s[0:1], v1, 32 clamp
705; VI-NEXT: v_ffbl_b32_e32 v0, v0
706; VI-NEXT: v_min3_u32 v1, v0, v1, 64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100707; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
708; VI-NEXT: s_endpgm
709;
710; EG-LABEL: v_cttz_i64:
711; EG: ; %bb.0:
712; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
713; EG-NEXT: TEX 0 @6
714; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
715; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
716; EG-NEXT: CF_END
717; EG-NEXT: PAD
718; EG-NEXT: Fetch clause starting at 6:
719; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
720; EG-NEXT: ALU clause starting at 8:
721; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
722; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
723; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
724; EG-NEXT: ALU clause starting at 11:
725; EG-NEXT: FFBL_INT * T1.W, T0.Y,
726; EG-NEXT: CNDE_INT * T1.W, T0.Y, literal.x, PV.W,
727; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
728; EG-NEXT: FFBL_INT T2.W, T0.X,
729; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
730; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
731; EG-NEXT: CNDE_INT T0.X, T0.X, PS, PV.W,
732; EG-NEXT: MOV T0.Y, 0.0,
733; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
734; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
735; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
736;
737; GFX10-LABEL: v_cttz_i64:
738; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900739; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100740; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100741; GFX10-NEXT: s_waitcnt lgkmcnt(0)
742; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
743; GFX10-NEXT: s_waitcnt vmcnt(0)
Jay Foade6c364a2021-08-05 09:58:29 +0100744; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
745; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
746; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp
747; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100748; GFX10-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100749; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
750; GFX10-NEXT: s_endpgm
751;
752; GFX10-GISEL-LABEL: v_cttz_i64:
753; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900754; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foad24b67a902021-08-04 11:55:29 +0100755; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100756; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
Jay Foad24b67a902021-08-04 11:55:29 +0100757; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
Jay Foadba5c4ac2021-08-03 17:13:02 +0100758; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
Jay Foad83610d42021-08-03 17:11:08 +0100759; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
760; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
761; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp
762; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1
Jay Foad24b67a902021-08-04 11:55:29 +0100763; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
Jay Foad83610d42021-08-03 17:11:08 +0100764; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0
Jay Foad24b67a902021-08-04 11:55:29 +0100765; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
Jay Foadba5c4ac2021-08-03 17:13:02 +0100766; GFX10-GISEL-NEXT: s_endpgm
767 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500768 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
769 %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
770 %val = load i64, ptr addrspace(1) %in.gep
Jay Foadba5c4ac2021-08-03 17:13:02 +0100771 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500772 store i64 %cttz, ptr addrspace(1) %out.gep
Jay Foadba5c4ac2021-08-03 17:13:02 +0100773 ret void
774}
775
; v_cttz_i64_trunc: same per-work-item i64 load and cttz (`i1 false`) as
; @v_cttz_i64, but the result is truncated to i32 and stored through an
; i32-typed GEP on %out. The expected code therefore scales the work-item id by
; 8 for the load address and by 4 for the store address, and stores one dword.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500776define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +0100777; SI-LABEL: v_cttz_i64_trunc:
778; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900779; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
Jay Foadba5c4ac2021-08-03 17:13:02 +0100780; SI-NEXT: s_mov_b32 s7, 0xf000
781; SI-NEXT: s_mov_b32 s6, 0
782; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
783; SI-NEXT: v_mov_b32_e32 v2, 0
784; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900785; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
Jay Foadba5c4ac2021-08-03 17:13:02 +0100786; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100787; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
Carl Ritson4c4db812022-07-30 11:13:20 +0900788; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
Jay Foadba5c4ac2021-08-03 17:13:02 +0100789; SI-NEXT: s_waitcnt vmcnt(0)
790; SI-NEXT: v_ffbl_b32_e32 v0, v4
Jay Foade6c364a2021-08-05 09:58:29 +0100791; SI-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100792; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
Jay Foade6c364a2021-08-05 09:58:29 +0100793; SI-NEXT: v_ffbl_b32_e32 v3, v3
794; SI-NEXT: v_min3_u32 v0, v3, v0, 64
Carl Ritson4c4db812022-07-30 11:13:20 +0900795; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100796; SI-NEXT: s_endpgm
797;
798; VI-LABEL: v_cttz_i64_trunc:
799; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900800; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100801; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100802; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
803; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -0800804; VI-NEXT: v_mov_b32_e32 v2, s3
805; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
Jay Foade6c364a2021-08-05 09:58:29 +0100806; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
Jay Foadba5c4ac2021-08-03 17:13:02 +0100807; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
Austin Kerbowda067ed2021-11-10 09:59:31 -0800808; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
809; VI-NEXT: v_mov_b32_e32 v4, s1
Jay Foade6c364a2021-08-05 09:58:29 +0100810; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
Jay Foadba5c4ac2021-08-03 17:13:02 +0100811; VI-NEXT: s_waitcnt vmcnt(0)
812; VI-NEXT: v_ffbl_b32_e32 v0, v2
Jay Foade6c364a2021-08-05 09:58:29 +0100813; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp
814; VI-NEXT: v_ffbl_b32_e32 v1, v1
815; VI-NEXT: v_min3_u32 v0, v1, v0, 64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100816; VI-NEXT: flat_store_dword v[3:4], v0
817; VI-NEXT: s_endpgm
818;
819; EG-LABEL: v_cttz_i64_trunc:
820; EG: ; %bb.0:
821; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
822; EG-NEXT: TEX 0 @6
823; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
824; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
825; EG-NEXT: CF_END
826; EG-NEXT: PAD
827; EG-NEXT: Fetch clause starting at 6:
828; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
829; EG-NEXT: ALU clause starting at 8:
830; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
831; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
832; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
833; EG-NEXT: ALU clause starting at 11:
834; EG-NEXT: FFBL_INT * T0.W, T1.Y,
835; EG-NEXT: CNDE_INT * T0.W, T1.Y, literal.x, PV.W,
836; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
837; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
838; EG-NEXT: FFBL_INT T1.W, T1.X, BS:VEC_120/SCL_212
839; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y,
840; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
841; EG-NEXT: CNDE_INT T0.X, T1.X, PS, PV.W,
842; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
843; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
844; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
845;
846; GFX10-LABEL: v_cttz_i64_trunc:
847; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900848; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100849; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100850; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
851; GFX10-NEXT: s_waitcnt lgkmcnt(0)
852; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3]
853; GFX10-NEXT: s_waitcnt vmcnt(0)
Jay Foade6c364a2021-08-05 09:58:29 +0100854; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
855; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
856; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
857; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64
Jay Foadba5c4ac2021-08-03 17:13:02 +0100858; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
859; GFX10-NEXT: s_endpgm
860;
861; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
862; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900863; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100864; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100865; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
866; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
867; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3]
868; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
Jay Foad83610d42021-08-03 17:11:08 +0100869; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
870; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
871; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
872; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2
873; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100874; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
875; GFX10-GISEL-NEXT: s_endpgm
876 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500877 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
878 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
879 %val = load i64, ptr addrspace(1) %in.gep
Jay Foadba5c4ac2021-08-03 17:13:02 +0100880 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
881 %trunc = trunc i64 %cttz to i32
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500882 store i32 %trunc, ptr addrspace(1) %out.gep
Jay Foadba5c4ac2021-08-03 17:13:02 +0100883 ret void
884}
885
; v_cttz_i32_sel_eq_neg1: loads the i32 at %valptr[workitem.id.x] and stores
; `select (%val == 0), -1, cttz(%val)` to %out (the store is not indexed by the
; work-item id). The SI, VI and GFX10 checks expect a bare v_ffbl_b32 with no
; compare or select; GFX10-GISEL still expects v_cmp_eq/v_min/v_cndmask, and EG
; expects two CNDE_INT selects.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500886define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +0100887; SI-LABEL: v_cttz_i32_sel_eq_neg1:
888; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900889; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
890; SI-NEXT: s_mov_b32 s7, 0xf000
891; SI-NEXT: s_mov_b32 s10, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100892; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
893; SI-NEXT: v_mov_b32_e32 v1, 0
Carl Ritson4c4db812022-07-30 11:13:20 +0900894; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +0100895; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900896; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
897; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
898; SI-NEXT: s_mov_b32 s6, -1
899; SI-NEXT: s_mov_b32 s4, s0
900; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100901; SI-NEXT: s_waitcnt vmcnt(0)
902; SI-NEXT: v_ffbl_b32_e32 v0, v0
Carl Ritson4c4db812022-07-30 11:13:20 +0900903; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100904; SI-NEXT: s_endpgm
905;
906; VI-LABEL: v_cttz_i32_sel_eq_neg1:
907; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900908; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100909; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100910; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -0800911; VI-NEXT: v_mov_b32_e32 v1, s3
912; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100913; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
914; VI-NEXT: flat_load_dword v0, v[0:1]
Austin Kerbowda067ed2021-11-10 09:59:31 -0800915; VI-NEXT: s_mov_b32 s3, 0xf000
916; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100917; VI-NEXT: s_waitcnt vmcnt(0)
918; VI-NEXT: v_ffbl_b32_e32 v0, v0
Austin Kerbowda067ed2021-11-10 09:59:31 -0800919; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100920; VI-NEXT: s_endpgm
921;
922; EG-LABEL: v_cttz_i32_sel_eq_neg1:
923; EG: ; %bb.0:
924; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
925; EG-NEXT: TEX 0 @6
926; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
927; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
928; EG-NEXT: CF_END
929; EG-NEXT: PAD
930; EG-NEXT: Fetch clause starting at 6:
931; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
932; EG-NEXT: ALU clause starting at 8:
933; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
934; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
935; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
936; EG-NEXT: ALU clause starting at 11:
937; EG-NEXT: FFBL_INT * T0.W, T0.X,
938; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
939; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
940; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
941; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
942; EG-NEXT: -1(nan), 2(2.802597e-45)
943;
944; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
945; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900946; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100947; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100948; GFX10-NEXT: v_mov_b32_e32 v1, 0
949; GFX10-NEXT: s_waitcnt lgkmcnt(0)
950; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
951; GFX10-NEXT: s_waitcnt vmcnt(0)
952; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
953; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
954; GFX10-NEXT: s_endpgm
955;
956; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
957; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900958; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +0100959; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100960; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
961; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
962; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
963; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0
964; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
Jay Foad83610d42021-08-03 17:11:08 +0100965; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
966; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo
Jay Foadba5c4ac2021-08-03 17:13:02 +0100967; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100968; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
969; GFX10-GISEL-NEXT: s_endpgm
970 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500971 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
972 %val = load i32, ptr addrspace(1) %in.gep
Jay Foadba5c4ac2021-08-03 17:13:02 +0100973 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
974 %cmp = icmp eq i32 %val, 0
975 %sel = select i1 %cmp, i32 -1, i32 %cttz
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500976 store i32 %sel, ptr addrspace(1) %out
Jay Foadba5c4ac2021-08-03 17:13:02 +0100977 ret void
978}
979
; v_cttz_i32_sel_ne_neg1: inverted-predicate form of the "-1 on zero input"
; pattern: stores `select (%val != 0), cttz(%val), -1` for the i32 loaded from
; %valptr[workitem.id.x]. The SI, VI and GFX10 checks again expect only
; v_ffbl_b32; GFX10-GISEL expects v_cmp_ne/v_min/v_cndmask, and EG expects two
; CNDE_INT selects.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -0500980define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +0100981; SI-LABEL: v_cttz_i32_sel_ne_neg1:
982; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +0900983; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
984; SI-NEXT: s_mov_b32 s7, 0xf000
985; SI-NEXT: s_mov_b32 s10, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100986; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
987; SI-NEXT: v_mov_b32_e32 v1, 0
Carl Ritson4c4db812022-07-30 11:13:20 +0900988; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +0100989; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +0900990; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
991; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
992; SI-NEXT: s_mov_b32 s6, -1
993; SI-NEXT: s_mov_b32 s4, s0
994; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +0100995; SI-NEXT: s_waitcnt vmcnt(0)
996; SI-NEXT: v_ffbl_b32_e32 v0, v0
Carl Ritson4c4db812022-07-30 11:13:20 +0900997; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +0100998; SI-NEXT: s_endpgm
999;
1000; VI-LABEL: v_cttz_i32_sel_ne_neg1:
1001; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001002; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001003; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001004; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -08001005; VI-NEXT: v_mov_b32_e32 v1, s3
1006; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001007; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1008; VI-NEXT: flat_load_dword v0, v[0:1]
Austin Kerbowda067ed2021-11-10 09:59:31 -08001009; VI-NEXT: s_mov_b32 s3, 0xf000
1010; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001011; VI-NEXT: s_waitcnt vmcnt(0)
1012; VI-NEXT: v_ffbl_b32_e32 v0, v0
Austin Kerbowda067ed2021-11-10 09:59:31 -08001013; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001014; VI-NEXT: s_endpgm
1015;
1016; EG-LABEL: v_cttz_i32_sel_ne_neg1:
1017; EG: ; %bb.0:
1018; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1019; EG-NEXT: TEX 0 @6
1020; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
1021; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1022; EG-NEXT: CF_END
1023; EG-NEXT: PAD
1024; EG-NEXT: Fetch clause starting at 6:
1025; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1026; EG-NEXT: ALU clause starting at 8:
1027; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1028; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1029; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1030; EG-NEXT: ALU clause starting at 11:
1031; EG-NEXT: FFBL_INT * T0.W, T0.X,
1032; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1033; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1034; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1035; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1036; EG-NEXT: -1(nan), 2(2.802597e-45)
1037;
1038; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
1039; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001040; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001041; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001042; GFX10-NEXT: v_mov_b32_e32 v1, 0
1043; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1044; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1045; GFX10-NEXT: s_waitcnt vmcnt(0)
1046; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
1047; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1048; GFX10-NEXT: s_endpgm
1049;
1050; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
1051; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001052; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001053; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001054; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1055; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1056; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1057; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001058; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
Jay Foad83610d42021-08-03 17:11:08 +01001059; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001060; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo
1061; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1062; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1063; GFX10-GISEL-NEXT: s_endpgm
1064 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001065 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1066 %val = load i32, ptr addrspace(1) %in.gep
Jay Foadba5c4ac2021-08-03 17:13:02 +01001067 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1068 %cmp = icmp ne i32 %val, 0
1069 %sel = select i1 %cmp, i32 %cttz, i32 -1
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001070 store i32 %sel, ptr addrspace(1) %out
Jay Foadba5c4ac2021-08-03 17:13:02 +01001071 ret void
1072}
1073
1074; TODO: Should be able to eliminate select here as well.
; v_cttz_i32_sel_eq_bitwidth: here the compare is on the cttz *result*
; (`icmp eq %cttz, 32`) rather than on the input being zero; the store is
; `select (%cttz == 32), -1, %cttz`. Every prefix below currently expects the
; clamp to 32 followed by an explicit compare and select (v_cmp/v_cndmask on
; amdgcn, SETE_INT/CNDE_INT on EG).
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001075define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +01001076; SI-LABEL: v_cttz_i32_sel_eq_bitwidth:
1077; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001078; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1079; SI-NEXT: s_mov_b32 s7, 0xf000
1080; SI-NEXT: s_mov_b32 s10, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001081; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1082; SI-NEXT: v_mov_b32_e32 v1, 0
Carl Ritson4c4db812022-07-30 11:13:20 +09001083; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +01001084; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +09001085; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1086; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1087; SI-NEXT: s_mov_b32 s6, -1
1088; SI-NEXT: s_mov_b32 s4, s0
1089; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001090; SI-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +01001091; SI-NEXT: v_ffbl_b32_e32 v0, v0
1092; SI-NEXT: v_min_u32_e32 v0, 32, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001093; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1094; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
Carl Ritson4c4db812022-07-30 11:13:20 +09001095; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001096; SI-NEXT: s_endpgm
1097;
1098; VI-LABEL: v_cttz_i32_sel_eq_bitwidth:
1099; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001100; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001101; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001102; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -08001103; VI-NEXT: v_mov_b32_e32 v1, s3
1104; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001105; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1106; VI-NEXT: flat_load_dword v0, v[0:1]
Austin Kerbowda067ed2021-11-10 09:59:31 -08001107; VI-NEXT: s_mov_b32 s3, 0xf000
1108; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001109; VI-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +01001110; VI-NEXT: v_ffbl_b32_e32 v0, v0
1111; VI-NEXT: v_min_u32_e32 v0, 32, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001112; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1113; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
Austin Kerbowda067ed2021-11-10 09:59:31 -08001114; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001115; VI-NEXT: s_endpgm
1116;
1117; EG-LABEL: v_cttz_i32_sel_eq_bitwidth:
1118; EG: ; %bb.0:
1119; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1120; EG-NEXT: TEX 0 @6
1121; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
1122; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1123; EG-NEXT: CF_END
1124; EG-NEXT: PAD
1125; EG-NEXT: Fetch clause starting at 6:
1126; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1127; EG-NEXT: ALU clause starting at 8:
1128; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1129; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1130; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1131; EG-NEXT: ALU clause starting at 11:
1132; EG-NEXT: FFBL_INT * T0.W, T0.X,
1133; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1134; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1135; EG-NEXT: SETE_INT * T1.W, PV.W, literal.x,
1136; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1137; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, literal.x,
1138; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1139; EG-NEXT: -1(nan), 2(2.802597e-45)
1140;
1141; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
1142; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001143; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001144; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foad2b639332021-08-05 14:32:25 +01001145; GFX10-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001146; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1147; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1148; GFX10-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +01001149; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
1150; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001151; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1152; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1153; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1154; GFX10-NEXT: s_endpgm
1155;
1156; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
1157; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001158; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001159; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foad83610d42021-08-03 17:11:08 +01001160; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001161; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1162; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1163; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
Jay Foad83610d42021-08-03 17:11:08 +01001164; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
1165; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001166; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0
1167; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
1168; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1169; GFX10-GISEL-NEXT: s_endpgm
1170 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001171 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1172 %val = load i32, ptr addrspace(1) %in.gep
Jay Foadba5c4ac2021-08-03 17:13:02 +01001173 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1174 %cmp = icmp eq i32 %cttz, 32
1175 %sel = select i1 %cmp, i32 -1, i32 %cttz
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001176 store i32 %sel, ptr addrspace(1) %out
Jay Foadba5c4ac2021-08-03 17:13:02 +01001177 ret void
1178}
1179
; Per-workitem test: loads an i32 from %valptr indexed by the workitem id,
; takes cttz with the zero-is-poison flag set to false, and replaces the
; result with -1 when it equals 32 (the i32 bit width) before storing it to
; %out. The expected output on every checked target keeps an explicit
; compare against 32 and a conditional select after the find-first-bit
; instruction.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001180define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +01001181; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1182; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001183; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1184; SI-NEXT: s_mov_b32 s7, 0xf000
1185; SI-NEXT: s_mov_b32 s10, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001186; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1187; SI-NEXT: v_mov_b32_e32 v1, 0
Carl Ritson4c4db812022-07-30 11:13:20 +09001188; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +01001189; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +09001190; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1191; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1192; SI-NEXT: s_mov_b32 s6, -1
1193; SI-NEXT: s_mov_b32 s4, s0
1194; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001195; SI-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +01001196; SI-NEXT: v_ffbl_b32_e32 v0, v0
1197; SI-NEXT: v_min_u32_e32 v0, 32, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001198; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1199; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
Carl Ritson4c4db812022-07-30 11:13:20 +09001200; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001201; SI-NEXT: s_endpgm
1202;
1203; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1204; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001205; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001206; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001207; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -08001208; VI-NEXT: v_mov_b32_e32 v1, s3
1209; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001210; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1211; VI-NEXT: flat_load_dword v0, v[0:1]
Austin Kerbowda067ed2021-11-10 09:59:31 -08001212; VI-NEXT: s_mov_b32 s3, 0xf000
1213; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001214; VI-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +01001215; VI-NEXT: v_ffbl_b32_e32 v0, v0
1216; VI-NEXT: v_min_u32_e32 v0, 32, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001217; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1218; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
Austin Kerbowda067ed2021-11-10 09:59:31 -08001219; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001220; VI-NEXT: s_endpgm
1221;
1222; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
1223; EG: ; %bb.0:
1224; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1225; EG-NEXT: TEX 0 @6
1226; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
1227; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1228; EG-NEXT: CF_END
1229; EG-NEXT: PAD
1230; EG-NEXT: Fetch clause starting at 6:
1231; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1232; EG-NEXT: ALU clause starting at 8:
1233; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1234; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1235; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1236; EG-NEXT: ALU clause starting at 11:
1237; EG-NEXT: FFBL_INT * T0.W, T0.X,
1238; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1239; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1240; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x,
1241; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1242; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W,
1243; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1244; EG-NEXT: -1(nan), 2(2.802597e-45)
1245;
1246; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
1247; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001248; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001249; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foad2b639332021-08-05 14:32:25 +01001250; GFX10-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001251; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1252; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1253; GFX10-NEXT: s_waitcnt vmcnt(0)
Jay Foad2b639332021-08-05 14:32:25 +01001254; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
1255; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001256; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1257; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1258; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1259; GFX10-NEXT: s_endpgm
1260;
1261; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
1262; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001263; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001264; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
Jay Foad83610d42021-08-03 17:11:08 +01001265; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001266; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1267; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1268; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
Jay Foad83610d42021-08-03 17:11:08 +01001269; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
1270; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001271; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1272; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1273; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1274; GFX10-GISEL-NEXT: s_endpgm
1275 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001276 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1277 %val = load i32, ptr addrspace(1) %in.gep
Jay Foadba5c4ac2021-08-03 17:13:02 +01001278 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1279 %cmp = icmp ne i32 %cttz, 32
1280 %sel = select i1 %cmp, i32 %cttz, i32 -1
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001281 store i32 %sel, ptr addrspace(1) %out
Jay Foadba5c4ac2021-08-03 17:13:02 +01001282 ret void
1283}
1284
; Per-workitem test: loads an i8 from %valptr at an element offset given by
; the workitem id, takes cttz with the zero-is-poison flag set to false, and
; selects -1 when the loaded value is zero before storing the i8 result to
; %out. The runs without -global-isel expect a find-first-bit with no
; compare or select on the value; the -global-isel run still expects an OR
; with 0x100, a compare against zero and a conditional select of -1.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001285 define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +01001286; SI-LABEL: v_cttz_i8_sel_eq_neg1:
1287; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001288; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1289; SI-NEXT: s_mov_b32 s7, 0xf000
Jay Foadba5c4ac2021-08-03 17:13:02 +01001290; SI-NEXT: v_mov_b32_e32 v1, 0
Carl Ritson4c4db812022-07-30 11:13:20 +09001291; SI-NEXT: s_mov_b32 s10, 0
1292; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +01001293; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +09001294; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1295; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1296; SI-NEXT: s_mov_b32 s6, -1
1297; SI-NEXT: s_mov_b32 s4, s0
1298; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001299; SI-NEXT: s_waitcnt vmcnt(0)
1300; SI-NEXT: v_ffbl_b32_e32 v0, v0
Carl Ritson4c4db812022-07-30 11:13:20 +09001301; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001302; SI-NEXT: s_endpgm
1303;
1304; VI-LABEL: v_cttz_i8_sel_eq_neg1:
1305; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001306; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001307; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -08001308; VI-NEXT: v_mov_b32_e32 v1, s3
1309; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001310; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1311; VI-NEXT: flat_load_ubyte v0, v[0:1]
Austin Kerbowda067ed2021-11-10 09:59:31 -08001312; VI-NEXT: s_mov_b32 s3, 0xf000
1313; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001314; VI-NEXT: s_waitcnt vmcnt(0)
1315; VI-NEXT: v_ffbl_b32_e32 v0, v0
Austin Kerbowda067ed2021-11-10 09:59:31 -08001316; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001317; VI-NEXT: s_endpgm
1318;
1319; EG-LABEL: v_cttz_i8_sel_eq_neg1:
1320; EG: ; %bb.0:
1321; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1322; EG-NEXT: TEX 0 @6
1323; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1324; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1325; EG-NEXT: CF_END
1326; EG-NEXT: PAD
1327; EG-NEXT: Fetch clause starting at 6:
1328; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1329; EG-NEXT: ALU clause starting at 8:
1330; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1331; EG-NEXT: ALU clause starting at 9:
1332; EG-NEXT: FFBL_INT T0.W, T0.X,
1333; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1334; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1335; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1336; EG-NEXT: LSHL * T1.W, PS, literal.y,
1337; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1338; EG-NEXT: LSHL T0.X, PV.W, PS,
1339; EG-NEXT: LSHL * T0.W, literal.x, PS,
1340; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1341; EG-NEXT: MOV T0.Y, 0.0,
1342; EG-NEXT: MOV * T0.Z, 0.0,
1343; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1344; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1345;
1346; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
1347; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001348; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001349; GFX10-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001350; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1351; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
1352; GFX10-NEXT: s_waitcnt vmcnt(0)
1353; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
1354; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
1355; GFX10-NEXT: s_endpgm
1356;
1357; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
1358; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001359; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001360; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001361; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1362; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
1363; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
1364; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
1365; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
Jay Foad57b91072021-08-06 11:05:42 +01001366; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001367; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1368; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
Jay Foadba5c4ac2021-08-03 17:13:02 +01001369; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0
Jay Foad57b91072021-08-06 11:05:42 +01001370; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
Jay Foadba5c4ac2021-08-03 17:13:02 +01001371; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
Jay Foad57b91072021-08-06 11:05:42 +01001372; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s2
1373; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1]
Jay Foadba5c4ac2021-08-03 17:13:02 +01001374; GFX10-GISEL-NEXT: s_endpgm
1375 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001376 %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
1377 %val = load i8, ptr addrspace(1) %valptr.gep
Jay Foadba5c4ac2021-08-03 17:13:02 +01001378 %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
1379 %cmp = icmp eq i8 %val, 0
1380 %sel = select i1 %cmp, i8 -1, i8 %cttz
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001381 store i8 %sel, ptr addrspace(1) %out
Jay Foadba5c4ac2021-08-03 17:13:02 +01001382 ret void
1383}
1384
; Loads a single i16 directly from %valptr (no workitem-id indexing), takes
; cttz with the zero-is-poison flag set to false, and selects -1 when the
; loaded value is zero before storing the i16 result to %out.
; The SI and EG checks expect a find-first-bit with no compare or select on
; the value; the VI checks and both GFX10 runs expect an OR with 0x10000, a
; find-first-bit, a compare against zero and a conditional select of 0xffff.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001385 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +01001386; SI-LABEL: v_cttz_i16_sel_eq_neg1:
1387; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001388; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1389; SI-NEXT: s_mov_b32 s7, 0xf000
1390; SI-NEXT: s_mov_b32 s6, -1
1391; SI-NEXT: s_mov_b32 s10, s6
1392; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +01001393; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +09001394; SI-NEXT: s_mov_b32 s8, s2
1395; SI-NEXT: s_mov_b32 s9, s3
1396; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
1397; SI-NEXT: s_mov_b32 s4, s0
1398; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001399; SI-NEXT: s_waitcnt vmcnt(0)
1400; SI-NEXT: v_ffbl_b32_e32 v0, v0
Carl Ritson4c4db812022-07-30 11:13:20 +09001401; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001402; SI-NEXT: s_endpgm
1403;
1404; VI-LABEL: v_cttz_i16_sel_eq_neg1:
1405; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001406; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1407; VI-NEXT: s_mov_b32 s7, 0xf000
1408; VI-NEXT: s_mov_b32 s6, -1
1409; VI-NEXT: s_mov_b32 s10, s6
1410; VI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +01001411; VI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +09001412; VI-NEXT: s_mov_b32 s8, s2
1413; VI-NEXT: s_mov_b32 s9, s3
1414; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001415; VI-NEXT: v_mov_b32_e32 v1, 0xffff
Carl Ritson4c4db812022-07-30 11:13:20 +09001416; VI-NEXT: s_mov_b32 s4, s0
1417; VI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001418; VI-NEXT: s_waitcnt vmcnt(0)
1419; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
Jay Foad2b639332021-08-05 14:32:25 +01001420; VI-NEXT: v_ffbl_b32_e32 v2, v2
1421; VI-NEXT: v_min_u32_e32 v2, 32, v2
Jay Foadba5c4ac2021-08-03 17:13:02 +01001422; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
1423; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
Carl Ritson4c4db812022-07-30 11:13:20 +09001424; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001425; VI-NEXT: s_endpgm
1426;
1427; EG-LABEL: v_cttz_i16_sel_eq_neg1:
1428; EG: ; %bb.0:
1429; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1430; EG-NEXT: TEX 0 @6
1431; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1432; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1433; EG-NEXT: CF_END
1434; EG-NEXT: PAD
1435; EG-NEXT: Fetch clause starting at 6:
1436; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1437; EG-NEXT: ALU clause starting at 8:
1438; EG-NEXT: MOV * T0.X, KC0[2].Z,
1439; EG-NEXT: ALU clause starting at 9:
1440; EG-NEXT: FFBL_INT T0.W, T0.X,
1441; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1442; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1443; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1444; EG-NEXT: LSHL * T1.W, PS, literal.y,
1445; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1446; EG-NEXT: LSHL T0.X, PV.W, PS,
1447; EG-NEXT: LSHL * T0.W, literal.x, PS,
1448; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1449; EG-NEXT: MOV T0.Y, 0.0,
1450; EG-NEXT: MOV * T0.Z, 0.0,
1451; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1452; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1453;
1454; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
1455; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001456; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001457; GFX10-NEXT: v_mov_b32_e32 v0, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001458; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1459; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
1460; GFX10-NEXT: s_waitcnt vmcnt(0)
1461; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001462; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
Jay Foad2b639332021-08-05 14:32:25 +01001463; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
1464; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
Jay Foadba5c4ac2021-08-03 17:13:02 +01001465; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1466; GFX10-NEXT: global_store_short v0, v1, s[0:1]
1467; GFX10-NEXT: s_endpgm
1468;
1469; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
1470; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001471; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001472; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001473; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1474; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
Jay Foadba5c4ac2021-08-03 17:13:02 +01001475; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1476; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
1477; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
1478; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
Jay Foad3eb22812022-05-16 15:48:11 +01001479; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
1480; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
Jay Foadba5c4ac2021-08-03 17:13:02 +01001481; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
1482; GFX10-GISEL-NEXT: s_endpgm
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001483 %val = load i16, ptr addrspace(1) %valptr
Jay Foadba5c4ac2021-08-03 17:13:02 +01001484 %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
1485 %cmp = icmp eq i16 %val, 0
1486 %sel = select i1 %cmp, i16 -1, i16 %cttz
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001487 store i16 %sel, ptr addrspace(1) %out
Jay Foadba5c4ac2021-08-03 17:13:02 +01001488 ret void
1489}
1490
1491; FIXME: Need to handle non-uniform case for function below (load without gep).
; Per-workitem test: loads an i7 from %valptr indexed by the workitem id,
; takes cttz with the zero-is-poison flag set to false, and selects -1 when
; the loaded value is zero before storing the i7 result to %out.
; The runs without -global-isel expect a find-first-bit followed by an AND
; with 0x7f (127 in the EG output) and no compare or select on the value;
; the -global-isel run expects an OR with 0x80, a compare against zero and a
; conditional select of 0x7f.
; NOTE(review): the FIXME directly above mentions a load without a gep, but
; this function does index %valptr through a gep -- confirm which function
; that FIXME is meant for.
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001492define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
Jay Foadba5c4ac2021-08-03 17:13:02 +01001493; SI-LABEL: v_cttz_i7_sel_eq_neg1:
1494; SI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001495; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1496; SI-NEXT: s_mov_b32 s7, 0xf000
Jay Foadba5c4ac2021-08-03 17:13:02 +01001497; SI-NEXT: v_mov_b32_e32 v1, 0
Carl Ritson4c4db812022-07-30 11:13:20 +09001498; SI-NEXT: s_mov_b32 s10, 0
1499; SI-NEXT: s_mov_b32 s11, s7
Jay Foadba5c4ac2021-08-03 17:13:02 +01001500; SI-NEXT: s_waitcnt lgkmcnt(0)
Carl Ritson4c4db812022-07-30 11:13:20 +09001501; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1502; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1503; SI-NEXT: s_mov_b32 s6, -1
1504; SI-NEXT: s_mov_b32 s4, s0
1505; SI-NEXT: s_mov_b32 s5, s1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001506; SI-NEXT: s_waitcnt vmcnt(0)
1507; SI-NEXT: v_ffbl_b32_e32 v0, v0
1508; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
Carl Ritson4c4db812022-07-30 11:13:20 +09001509; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001510; SI-NEXT: s_endpgm
1511;
1512; VI-LABEL: v_cttz_i7_sel_eq_neg1:
1513; VI: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001514; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001515; VI-NEXT: s_waitcnt lgkmcnt(0)
Austin Kerbowda067ed2021-11-10 09:59:31 -08001516; VI-NEXT: v_mov_b32_e32 v1, s3
1517; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001518; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1519; VI-NEXT: flat_load_ubyte v0, v[0:1]
Austin Kerbowda067ed2021-11-10 09:59:31 -08001520; VI-NEXT: s_mov_b32 s3, 0xf000
1521; VI-NEXT: s_mov_b32 s2, -1
Jay Foadba5c4ac2021-08-03 17:13:02 +01001522; VI-NEXT: s_waitcnt vmcnt(0)
1523; VI-NEXT: v_ffbl_b32_e32 v0, v0
1524; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
Austin Kerbowda067ed2021-11-10 09:59:31 -08001525; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001526; VI-NEXT: s_endpgm
1527;
1528; EG-LABEL: v_cttz_i7_sel_eq_neg1:
1529; EG: ; %bb.0:
1530; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1531; EG-NEXT: TEX 0 @6
1532; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1533; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1534; EG-NEXT: CF_END
1535; EG-NEXT: PAD
1536; EG-NEXT: Fetch clause starting at 6:
1537; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1538; EG-NEXT: ALU clause starting at 8:
1539; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1540; EG-NEXT: ALU clause starting at 9:
1541; EG-NEXT: FFBL_INT T0.W, T0.X,
1542; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1543; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1544; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1545; EG-NEXT: LSHL * T1.W, PS, literal.y,
1546; EG-NEXT: 127(1.779649e-43), 3(4.203895e-45)
1547; EG-NEXT: LSHL T0.X, PV.W, PS,
1548; EG-NEXT: LSHL * T0.W, literal.x, PS,
1549; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1550; EG-NEXT: MOV T0.Y, 0.0,
1551; EG-NEXT: MOV * T0.Z, 0.0,
1552; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1553; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1554;
1555; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
1556; GFX10: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001557; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001558; GFX10-NEXT: v_mov_b32_e32 v1, 0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001559; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1560; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
1561; GFX10-NEXT: s_waitcnt vmcnt(0)
1562; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
1563; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0
1564; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
1565; GFX10-NEXT: s_endpgm
1566;
1567; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
1568; GFX10-GISEL: ; %bb.0:
Carl Ritson4c4db812022-07-30 11:13:20 +09001569; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
Jay Foadba5c4ac2021-08-03 17:13:02 +01001570; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001571; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1572; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
1573; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
Jay Foadba5c4ac2021-08-03 17:13:02 +01001574; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
1575; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1576; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1577; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
Jay Foad57b91072021-08-06 11:05:42 +01001578; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x80, v0
Jay Foad3eb22812022-05-16 15:48:11 +01001579; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001580; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
1581; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1582; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
1583; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
Jay Foad3eb22812022-05-16 15:48:11 +01001584; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
Jay Foadba5c4ac2021-08-03 17:13:02 +01001585; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
1586; GFX10-GISEL-NEXT: s_endpgm
1587 %tid = call i32 @llvm.amdgcn.workitem.id.x()
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001588 %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
1589 %val = load i7, ptr addrspace(1) %valptr.gep
Jay Foadba5c4ac2021-08-03 17:13:02 +01001590 %cttz = call i7 @llvm.cttz.i7(i7 %val, i1 false) nounwind readnone
1591 %cmp = icmp eq i7 %val, 0
1592 %sel = select i1 %cmp, i7 -1, i7 %cttz
Matt Arsenaultb5bc2052022-11-29 18:26:06 -05001593 store i7 %sel, ptr addrspace(1) %out
Jay Foadba5c4ac2021-08-03 17:13:02 +01001594 ret void
1595}
1595}