; blob: 1e0ac3807528000e7ea1134770ef31d34746bbbb
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s

; add i32 %a, (lshr i32 %a, 16).
; Peephole disabled: a separate v_lshrrev_b32 is expected to feed v_add_i32_e32.
; Peephole enabled: the shift is expected to be absorbed into v_add_i32_sdwa
; as src1_sel:WORD_1.
; GCN-LABEL: {{^}}add_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
; NOSDWA-NOT: v_add_i32_sdwa

; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %a = load i32, i32 addrspace(1)* %in, align 4
  %shr = lshr i32 %a, 16
  %add = add i32 %a, %shr
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; sub i32 (lshr i32 %a, 16), %a.
; Peephole disabled: a separate v_lshrrev_b32 is expected to feed
; v_subrev_i32_e32.
; Peephole enabled: the shift is expected to be absorbed into
; v_subrev_i32_sdwa as src1_sel:WORD_1.
; GCN-LABEL: {{^}}sub_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_subrev_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
; NOSDWA-NOT: v_subrev_i32_sdwa

; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %a = load i32, i32 addrspace(1)* %in, align 4
  %shr = lshr i32 %a, 16
  %sub = sub i32 %shr, %a
  store i32 %sub, i32 addrspace(1)* %out, align 4
  ret void
}

; mul i32 (lshr i32 %a, 16), (lshr i32 %b, 16).
; Peephole disabled: two v_lshrrev_b32 results are expected to feed
; v_mul_u32_u24_e32.
; Peephole enabled: both shifts are expected to be absorbed into
; v_mul_u32_u24_sdwa as src0_sel:WORD_1 and src1_sel:WORD_1.
; GCN-LABEL: {{^}}mul_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
  %a = load i32, i32 addrspace(1)* %in1, align 4
  %b = load i32, i32 addrspace(1)* %in2, align 4
  %shra = lshr i32 %a, 16
  %shrb = lshr i32 %b, 16
  %mul = mul i32 %shra, %shrb
  store i32 %mul, i32 addrspace(1)* %out, align 4
  ret void
}

; Scalar i16 multiply. Both configurations expect a plain v_mul_lo_i32 and
; no v_mul_u32_u24_sdwa.
; GCN-LABEL: {{^}}mul_i16:
; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_u32_u24_sdwa

define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) {
entry:
  %a = load i16, i16 addrspace(1)* %ina, align 4
  %b = load i16, i16 addrspace(1)* %inb, align 4
  %mul = mul i16 %a, %b
  store i16 %mul, i16 addrspace(1)* %out, align 4
  ret void
}

; <2 x i16> multiply.
; Peephole disabled: an explicit shift / multiply / shift / or sequence is
; expected.
; Peephole enabled: two v_mul_u32_u24_sdwa (one on WORD_0, one on WORD_1
; writing WORD_1) are expected to be merged by a v_or_b32_sdwa.
; GCN-LABEL: {{^}}mul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0

define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
  %mul = mul <2 x i16> %a, %b
  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; <4 x i16> multiply.
; Peephole disabled: the shift / multiply / shift / or pattern is expected.
; Peephole enabled: four v_mul_u32_u24_sdwa (alternating WORD_0 and WORD_1)
; are expected to be merged pairwise by two v_or_b32_sdwa.
; GCN-LABEL: {{^}}mul_v4i16:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0

define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
entry:
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
  %mul = mul <4 x i16> %a, %b
  store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4
  ret void
}

; <8 x i16> multiply.
; Peephole disabled: the shift / multiply / shift / or pattern is expected.
; Peephole enabled: eight v_mul_u32_u24_sdwa (alternating WORD_0 and WORD_1)
; are expected to be merged pairwise by four v_or_b32_sdwa.
; GCN-LABEL: {{^}}mul_v8i16:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0

define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
entry:
  %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
  %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
  %mul = mul <8 x i16> %a, %b
  store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4
  ret void
}

; Scalar half fmul. Both configurations expect a plain v_mul_f16_e32 and no
; v_mul_f16_sdwa.
; GCN-LABEL: {{^}}mul_half:
; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa
; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_f16_sdwa

define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) {
entry:
  %a = load half, half addrspace(1)* %ina, align 4
  %b = load half, half addrspace(1)* %inb, align 4
  %mul = fmul half %a, %b
  store half %mul, half addrspace(1)* %out, align 4
  ret void
}

; <2 x half> fmul.
; Peephole disabled: an explicit shift / multiply / shift / or sequence is
; expected.
; Peephole enabled: the high element is expected to use v_mul_f16_sdwa
; (WORD_1 in, WORD_1 out), the low element a plain v_mul_f16_e32, and the two
; results are expected to be merged by v_or_b32_e32.
; GCN-LABEL: {{^}}mul_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa

; SDWA-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]

define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
entry:
  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
  %mul = fmul <2 x half> %a, %b
  store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; <4 x half> fmul.
; Peephole disabled: the shift / multiply / shift / or pattern is expected.
; Peephole enabled: two v_mul_f16_sdwa on WORD_1 and two v_or_b32_e32 are
; expected.
; GCN-LABEL: {{^}}mul_v4half:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa

; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
entry:
  %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
  %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
  %mul = fmul <4 x half> %a, %b
  store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4
  ret void
}

; <8 x half> fmul.
; Peephole disabled: the shift / multiply / shift / or pattern is expected.
; Peephole enabled: four v_mul_f16_sdwa on WORD_1 and four v_or_b32_e32 are
; expected.
; GCN-LABEL: {{^}}mul_v8half:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa

; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
entry:
  %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
  %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
  %mul = fmul <8 x half> %a, %b
  store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4
  ret void
}

; Scalar i8 multiply. Both configurations expect a plain v_mul_lo_i32 and no
; v_mul_u32_u24_sdwa.
; GCN-LABEL: {{^}}mul_i8:
; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_u32_u24_sdwa

define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) {
entry:
  %a = load i8, i8 addrspace(1)* %ina, align 4
  %b = load i8, i8 addrspace(1)* %inb, align 4
  %mul = mul i8 %a, %b
  store i8 %mul, i8 addrspace(1)* %out, align 4
  ret void
}

; <2 x i8> multiply.
; Peephole disabled: 8-bit shifts (v_lshrrev_b16 / v_lshlrev_b16) around
; v_mul_u32_u24_e32, merged with v_or_b32_e32, are expected.
; Peephole enabled: a v_mul_u32_u24_sdwa reading and writing BYTE_1 is
; expected.
; GCN-LABEL: {{^}}mul_v2i8:
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1

define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
entry:
  %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
  %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4
  %mul = mul <2 x i8> %a, %b
  store <2 x i8> %mul, <2 x i8> addrspace(1)* %out, align 4
  ret void
}

; <4 x i8> multiply.
; Peephole disabled: the 8-bit shift / multiply / shift / or pattern is
; expected.
; Peephole enabled: three v_mul_u32_u24_sdwa are expected; their operand
; selects are not checked.
; GCN-LABEL: {{^}}mul_v4i8:
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; SDWA-DAG: v_mul_u32_u24_sdwa
; SDWA-DAG: v_mul_u32_u24_sdwa
; SDWA-DAG: v_mul_u32_u24_sdwa

define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
entry:
  %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4
  %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4
  %mul = mul <4 x i8> %a, %b
  store <4 x i8> %mul, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; <8 x i8> multiply.
; Peephole disabled: the 8-bit shift / multiply / shift / or pattern is
; expected.
; Peephole enabled: six v_mul_u32_u24_sdwa are expected; their operand
; selects are not checked.
; GCN-LABEL: {{^}}mul_v8i8:
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; SDWA-DAG: v_mul_u32_u24_sdwa
; SDWA-DAG: v_mul_u32_u24_sdwa
; SDWA-DAG: v_mul_u32_u24_sdwa
; SDWA-DAG: v_mul_u32_u24_sdwa
; SDWA-DAG: v_mul_u32_u24_sdwa
; SDWA-DAG: v_mul_u32_u24_sdwa

define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
entry:
  %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4
  %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4
  %mul = mul <8 x i8> %a, %b
  store <8 x i8> %mul, <8 x i8> addrspace(1)* %out, align 4
  ret void
}

; sitofp <2 x i16> to <2 x half>.
; Peephole disabled: explicit sign extension (v_bfe_i32 for the low element,
; v_ashrrev_i32 by 16 for the high one) is expected ahead of two
; v_cvt_f32_i32_e32.
; Peephole enabled: both conversions are expected as v_cvt_f32_i32_sdwa with
; a sext() source modifier, selecting WORD_0 and WORD_1 respectively.
; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16:
; NOSDWA-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; NOSDWA-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_cvt_f32_i32_sdwa

; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1

define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x i16> addrspace(1)* %a) {
entry:
  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
  %r.val = sitofp <2 x i16> %a.val to <2 x half>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}


; <2 x half> fmul followed by fadd with %b (multiply-accumulate shape).
; Peephole disabled: shifted operands are expected to feed v_mac_f16_e32,
; whose result is shifted left by 16 and merged with v_or_b32_e32.
; Peephole enabled: v_mac_f16_sdwa is expected to read WORD_1 of both sources
; but write a full DWORD, so the following v_lshlrev_b32 by 16 of its result
; is still expected.
; GCN-LABEL: {{^}}mac_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
; NOSDWA-NOT: v_mac_f16_sdwa

; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]

define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
entry:
  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
  %mul = fmul <2 x half> %a, %b
  %mac = fadd <2 x half> %mul, %b
  store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; <2 x i16> multiply by the constant vector <123, 321>. Neither configuration
; is expected to emit v_mul_u32_u24_sdwa.
; GCN-LABEL: {{^}}immediate_mul_v2i16:
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; SDWA-NOT: v_mul_u32_u24_sdwa

define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
  %mul = mul <2 x i16> %a, <i16 123, i16 321>
  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; Double use of same src - should not convert it
; (%b is an operand of both chained multiplies. The peephole-enabled run only
; checks for a v_mul_u32_u24_sdwa whose src0 stays DWORD and whose src1
; selects WORD_1.)
; GCN-LABEL: {{^}}mulmul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa

; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
  %mul = mul <2 x i16> %a, %b
  %mul2 = mul <2 x i16> %mul, %b
  store <2 x i16> %mul2, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; <2 x i16> add where the loads, the add and the store each sit in their own
; basic block (entry, add_label, store_label).
; Peephole disabled: no v_add_i32_sdwa is expected.
; Peephole enabled: a v_add_i32_sdwa reading and writing WORD_1 is expected.
; GCN-LABEL: {{^}}add_bb_v2i16:
; NOSDWA-NOT: v_add_i32_sdwa

; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
  br label %add_label
add_label:
  %add = add <2 x i16> %a, %b
  br label %store_label
store_label:
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
  ret void
}