; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST
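;
; The -SLOW runs model targets where the horizontal ops (haddps/phaddd) are
; considered slow, so the backend may prefer shuffle+add sequences; the -FAST
; runs add the fast-hops tuning attribute, which lets the backend favor the
; horizontal add instructions directly.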

; Vectorized Pairwise Sum Reductions
; e.g.
; inline STYPE sum(VTYPE x) {
;   return (x[0] + x[1]) + (x[2] + x[3]);
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
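;
; As a concrete instantiation of the pattern above (a hypothetical sketch, not
; part of the original test), take STYPE = float and VTYPE = a 4 x float
; vector via Clang/GCC vector extensions; the first function below then
; corresponds roughly to:
;
;   typedef float v4f32 __attribute__((vector_size(16)));
;   static inline float sum(v4f32 x) { return (x[0] + x[1]) + (x[2] + x[3]); }
;   v4f32 sum4(v4f32 a0, v4f32 a1, v4f32 a2, v4f32 a3) {
;     return (v4f32){ sum(a0), sum(a1), sum(a2), sum(a3) };
;   }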

define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm0
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: pair_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %7 = fadd <2 x float> %5, %6
  %8 = shufflevector <2 x float> %7, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %9 = fadd <2 x float> %7, %8
  %10 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %11 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %12 = fadd <2 x float> %10, %11
  %13 = shufflevector <2 x float> %12, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %14 = fadd <2 x float> %12, %13
  %15 = shufflevector <2 x float> %9, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %16 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %17 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %18 = fadd <2 x float> %16, %17
  %19 = shufflevector <2 x float> %18, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %20 = fadd <2 x float> %18, %19
  %21 = shufflevector <2 x float> %20, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %22 = shufflevector <4 x float> %15, <4 x float> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %25 = fadd <2 x float> %23, %24
  %26 = shufflevector <2 x float> %25, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %27 = fadd <2 x float> %25, %26
  %28 = shufflevector <2 x float> %27, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x float> %22, <4 x float> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %29
}

define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: pair_sum_v4i32_v4i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
  %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %7 = add <2 x i32> %5, %6
  %8 = shufflevector <2 x i32> %7, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %9 = add <2 x i32> %7, %8
  %10 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %11 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %12 = add <2 x i32> %10, %11
  %13 = shufflevector <2 x i32> %12, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %14 = add <2 x i32> %12, %13
  %15 = shufflevector <2 x i32> %9, <2 x i32> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %16 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %18 = add <2 x i32> %16, %17
  %19 = shufflevector <2 x i32> %18, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %20 = add <2 x i32> %18, %19
  %21 = shufflevector <2 x i32> %20, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %22 = shufflevector <4 x i32> %15, <4 x i32> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %25 = add <2 x i32> %23, %24
  %26 = shufflevector <2 x i32> %25, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %27 = add <2 x i32> %25, %26
  %28 = shufflevector <2 x i32> %27, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x i32> %22, <4 x i32> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %29
}

define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5
; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
; SSSE3-FAST-NEXT: haddps %xmm5, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm4, %xmm2
; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6
; SSSE3-FAST-NEXT: haddps %xmm6, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm4, %xmm1
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm1
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8
; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm1
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm1
; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: retq
  %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %11 = fadd <2 x float> %9, %10
  %12 = shufflevector <2 x float> %11, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %13 = fadd <2 x float> %11, %12
  %14 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %15 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %16 = fadd <2 x float> %14, %15
  %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %18 = fadd <2 x float> %16, %17
  %19 = shufflevector <2 x float> %13, <2 x float> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %20 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %21 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %22 = fadd <2 x float> %20, %21
  %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %25 = fadd <2 x float> %23, %24
  %26 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %27 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %28 = fadd <2 x float> %26, %27
  %29 = shufflevector <2 x float> %28, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %30 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %31 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %32 = fadd <2 x float> %30, %31
  %33 = shufflevector <2 x float> %32, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %34 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %35 = shufflevector <4 x float> %34, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %36 = shufflevector <4 x float> %35, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %37 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %38 = shufflevector <4 x float> %37, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %39 = shufflevector <4 x float> %38, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %40 = fadd <4 x float> %36, %39
  %41 = shufflevector <4 x float> %40, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <8 x float> %19, <8 x float> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
  %43 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %44 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %45 = fadd <2 x float> %43, %44
  %46 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %47 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %48 = fadd <2 x float> %46, %47
  %49 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 0, i32 2>
  %50 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 1, i32 3>
  %51 = fadd <2 x float> %49, %50
  %52 = shufflevector <2 x float> %51, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x float> %42, <8 x float> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x float> %53
}

define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2
; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5
; SSSE3-SLOW-NEXT: phaddd %xmm5, %xmm2
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT: movdqa %xmm6, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: phaddd %xmm5, %xmm4
; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm2
; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm6
; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm7
; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm6
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2]
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm2
; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: pair_sum_v8i32_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: pair_sum_v8i32_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: retq
  %9 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %10 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %11 = add <2 x i32> %9, %10
  %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %13 = add <2 x i32> %11, %12
  %14 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %15 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %16 = add <2 x i32> %14, %15
  %17 = shufflevector <2 x i32> %16, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %18 = add <2 x i32> %16, %17
  %19 = shufflevector <2 x i32> %13, <2 x i32> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %20 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %21 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %22 = add <2 x i32> %20, %21
  %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %25 = add <2 x i32> %23, %24
  %26 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %27 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %28 = add <2 x i32> %26, %27
  %29 = shufflevector <2 x i32> %28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %30 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %31 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %32 = add <2 x i32> %30, %31
  %33 = shufflevector <2 x i32> %32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %34 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %35 = shufflevector <4 x i32> %34, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %36 = shufflevector <4 x i32> %35, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %37 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %38 = shufflevector <4 x i32> %37, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %39 = shufflevector <4 x i32> %38, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %40 = add <4 x i32> %36, %39
  %41 = shufflevector <4 x i32> %40, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <8 x i32> %19, <8 x i32> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
  %43 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %44 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %45 = add <2 x i32> %43, %44
  %46 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %47 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %48 = add <2 x i32> %46, %47
  %49 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 0, i32 2>
  %50 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 1, i32 3>
  %51 = add <2 x i32> %49, %50
  %52 = shufflevector <2 x i32> %51, <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x i32> %42, <8 x i32> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x i32> %53
}

; Vectorized Sequential Sum Reductions
; e.g.
; inline STYPE sum(VTYPE x) {
;   return ((x[0] + x[1]) + x[2]) + x[3];
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
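;
; A hypothetical instantiation for VTYPE = a 4 x i32 vector (an assumption,
; not part of the original test):
;
;   typedef int v4i32 __attribute__((vector_size(16)));
;   static inline int sum(v4i32 x) { return ((x[0] + x[1]) + x[2]) + x[3]; }
;
; Note the adds are left-associated: for the float variant this ordering must
; be preserved (the fadds carry no reassociation flags), which is why the
; lowerings below mix shuffles with addps/addss instead of using a purely
; pairwise haddps tree.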

define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
; SSSE3-SLOW-NEXT: addps %xmm5, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5
; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1]
; SSSE3-FAST-NEXT: addps %xmm5, %xmm0
; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT: addps %xmm1, %xmm2
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT: addps %xmm2, %xmm3
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm4
; AVX-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm4, %xmm4
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm4
; AVX-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %5 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 0, i32 4>
  %6 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 1, i32 5>
  %7 = fadd <2 x float> %5, %6
  %8 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 6>
  %9 = fadd <2 x float> %8, %7
  %10 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 7>
  %11 = fadd <2 x float> %10, %9
  %12 = shufflevector <2 x float> %11, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %13 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %14 = fadd <4 x float> %13, %2
  %15 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %16 = fadd <4 x float> %15, %14
  %17 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %18 = fadd <4 x float> %17, %16
  %19 = shufflevector <4 x float> %12, <4 x float> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %20 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %21 = fadd <4 x float> %20, %3
  %22 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %23 = fadd <4 x float> %22, %21
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %25 = fadd <4 x float> %24, %23
  %26 = shufflevector <4 x float> %19, <4 x float> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %26
}

define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm4
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-SLOW-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movdqa %xmm0, %xmm4
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm4
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-FAST-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-FAST-NEXT: paddd %xmm0, %xmm4
; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm5
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm5
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm5, %xmm6
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: retq
  %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
  %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>
  %7 = add <2 x i32> %5, %6
  %8 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 2, i32 6>
  %9 = add <2 x i32> %8, %7
  %10 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 3, i32 7>
  %11 = add <2 x i32> %10, %9
  %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %13 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %14 = add <4 x i32> %13, %2
  %15 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %16 = add <4 x i32> %15, %14
  %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %18 = add <4 x i32> %17, %16
  %19 = shufflevector <4 x i32> %12, <4 x i32> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %20 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %21 = add <4 x i32> %20, %3
  %22 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %23 = add <4 x i32> %22, %21
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %25 = add <4 x i32> %24, %23
  %26 = shufflevector <4 x i32> %19, <4 x i32> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %26
}

; Vectorized Reductions
; e.g.
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { reduce( A0 ), reduce( A1 ), reduce( A2 ), reduce( A3 ) };
; }
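;
; reduce() is assumed here to be a full horizontal reduction over all four
; lanes, e.g. (hypothetical C, reusing the v4f32 typedef sketched earlier):
;
;   static inline float reduce(v4f32 x) { return ((x[0] + x[1]) + x[2]) + x[3]; }
;
; which matches the movshdup/unpckhpd/addss chains checked below.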

define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm5, %xmm0
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm5, %xmm1
; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm4
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm2
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm4
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm3
; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT:    addss %xmm5, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm4
; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm4
; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm5
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-FAST-NEXT:    addss %xmm5, %xmm1
; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm4
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-FAST-NEXT:    addss %xmm4, %xmm2
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm1
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm4
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT:    addss %xmm4, %xmm3
; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm0, %xmm4
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm4, %xmm0
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm1, %xmm4
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm4, %xmm1
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm3, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm4
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT:    vaddss %xmm0, %xmm4, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm4
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-FAST-NEXT:    vaddss %xmm1, %xmm4, %xmm1
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-FAST-NEXT:    vaddss %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-FAST-NEXT:    retq
  %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
  %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
  %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
  %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
  %9 = insertelement <4 x float> undef, float %5, i32 0
  %10 = insertelement <4 x float> %9, float %6, i32 1
  %11 = insertelement <4 x float> %10, float %7, i32 2
  %12 = insertelement <4 x float> %11, float %8, i32 3
  ret <4 x float> %12
}
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)

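; The reassoc calls below permit the fadds to be reassociated, so instead of
; the serial chain of scalar addss ops above the backend may reduce with
; pairwise vector adds, roughly reduce(x) == (x[0] + x[2]) + (x[1] + x[3]).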
define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm2
; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm3
; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-FAST-NEXT:    addps %xmm4, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm4
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm4
; SSSE3-FAST-NEXT:    haddps %xmm4, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-FAST-NEXT:    addps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm2
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT:    addps %xmm3, %xmm2
; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm1
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm2, %xmm2
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1]
; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
; AVX-SLOW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX-SLOW-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-FAST-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-FAST-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-FAST-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-FAST-NEXT:    retq
  %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
  %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
  %7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
  %8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
  %9 = insertelement <4 x float> undef, float %5, i32 0
  %10 = insertelement <4 x float> %9, float %6, i32 1
  %11 = insertelement <4 x float> %10, float %7, i32 2
  %12 = insertelement <4 x float> %11, float %8, i32 3
  ret <4 x float> %12
}

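; Integer add is associative, so the plain llvm.vector.reduce.add calls below
; can lower to the same pairwise shuffle+paddd (or phaddd with fast-hops)
; pattern as the reassoc float case.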
define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm6
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSSE3-SLOW-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT:    paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm1, %xmm4
; SSSE3-FAST-NEXT:    phaddd %xmm4, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm3, %xmm2
; SSSE3-FAST-NEXT:    phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX-FAST-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX-FAST-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm1
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    retq
  %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0)
  %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1)
  %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2)
  %8 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %3)
  %9 = insertelement <4 x i32> undef, i32 %5, i32 0
  %10 = insertelement <4 x i32> %9, i32 %6, i32 1
  %11 = insertelement <4 x i32> %10, i32 %7, i32 2
  %12 = insertelement <4 x i32> %11, i32 %8, i32 3
  ret <4 x i32> %12
}
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)