; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
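; As a minimal illustrative sketch (the C source below is hypothetical and
; not part of this test), a stride-2 store loop such as
;
;   void store2(short *a, short *b, short *dst, int n) {
;     for (int i = 0; i < n; i++) {
;       dst[2 * i + 0] = a[i];
;       dst[2 * i + 1] = b[i];
;     }
;   }
;
; is vectorized into two wide loads, a concatenating shufflevector, an
; interleaving shufflevector with the mask <0, vf, 1, vf+1, ...>, and a
; single wide store; each function below hands that shape to llc at a
; different vectorization factor (vf).
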
define void @store_i16_stride2_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i16_stride2_vf2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: vmovq %xmm0, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride2_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovq %xmm0, (%rdx)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i16_stride2_vf2:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride2_vf2:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride2_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512-NEXT: vmovq %xmm0, (%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride2_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i16_stride2_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride2_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64
  %1 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %interleaved.vec = shufflevector <4 x i16> %1, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i16_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa %xmm1, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i16_stride2_vf4:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride2_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i16_stride2_vf4:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride2_vf4:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride2_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf4:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf4:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride2_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i16_stride2_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride2_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <4 x i16>, ptr %in.vecptr1, align 64
  %1 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa (%rsi), %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movdqa %xmm0, 16(%rdx)
; SSE-NEXT: movdqa %xmm2, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i16_stride2_vf8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rsi), %xmm1
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm2, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride2_vf8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i16_stride2_vf8:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride2_vf8:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride2_vf8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf8:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf8:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride2_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i16_stride2_vf8:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride2_vf8:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf8:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <8 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <8 x i16>, ptr %in.vecptr1, align 64
  %1 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %interleaved.vec = shufflevector <16 x i16> %1, <16 x i16> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa (%rsi), %xmm2
; SSE-NEXT: movdqa 16(%rsi), %xmm3
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: movdqa %xmm1, 32(%rdx)
; SSE-NEXT: movdqa %xmm2, 48(%rdx)
; SSE-NEXT: movdqa %xmm0, (%rdx)
; SSE-NEXT: movdqa %xmm4, 16(%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i16_stride2_vf16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride2_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i16_stride2_vf16:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm1, (%rdx)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride2_vf16:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rdx)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride2_vf16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512-NEXT: vmovdqa %xmm2, 48(%rdx)
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf16:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 48(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf16:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride2_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i16_stride2_vf16:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride2_vf16:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf16:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <16 x i16>, ptr %in.vecptr1, align 64
  %1 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %interleaved.vec = shufflevector <32 x i16> %1, <32 x i16> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm3
; SSE-NEXT: movdqa (%rsi), %xmm4
; SSE-NEXT: movdqa 16(%rsi), %xmm5
; SSE-NEXT: movdqa 32(%rsi), %xmm6
; SSE-NEXT: movdqa 48(%rsi), %xmm7
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE-NEXT: movdqa %xmm3, %xmm6
; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
; SSE-NEXT: movdqa %xmm3, 96(%rdx)
; SSE-NEXT: movdqa %xmm6, 112(%rdx)
; SSE-NEXT: movdqa %xmm2, 64(%rdx)
; SSE-NEXT: movdqa %xmm5, 80(%rdx)
; SSE-NEXT: movdqa %xmm1, 32(%rdx)
; SSE-NEXT: movdqa %xmm4, 48(%rdx)
; SSE-NEXT: movdqa %xmm0, (%rdx)
; SSE-NEXT: movdqa %xmm8, 16(%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: store_i16_stride2_vf32:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX-NEXT: vmovdqa (%rdi), %xmm4
; AVX-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm5, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm7, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX-NEXT: vmovdqa %xmm8, 80(%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride2_vf32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa (%rsi), %ymm2
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-NEXT: vmovdqa %ymm3, 96(%rdx)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm2, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i16_stride2_vf32:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2
; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rdx)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride2_vf32:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2
; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rdx)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride2_vf32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512-NEXT: vmovdqa (%rdi), %xmm4
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX512-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX512-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX512-NEXT: vmovdqa %xmm5, 80(%rdx)
; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-NEXT: vmovdqa %xmm8, 16(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf32:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX512DQ-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm5, 80(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm8, 16(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf32:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride2_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i16_stride2_vf32:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride2_vf32:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <32 x i16>, ptr %in.vecptr1, align 64
  %1 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %interleaved.vec = shufflevector <64 x i16> %1, <64 x i16> poison, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  store <64 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

848define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
849; SSE-LABEL: store_i16_stride2_vf64:
850; SSE: # %bb.0:
851; SSE-NEXT: movdqa 112(%rdi), %xmm0
852; SSE-NEXT: movdqa 96(%rdi), %xmm6
853; SSE-NEXT: movdqa 80(%rdi), %xmm4
854; SSE-NEXT: movdqa 64(%rdi), %xmm3
Jay Foad7b3bbd82023-10-09 12:31:32 +0100855; SSE-NEXT: movdqa (%rdi), %xmm8
Roman Lebedev6893b152022-12-13 04:04:42 +0300856; SSE-NEXT: movdqa 16(%rdi), %xmm1
857; SSE-NEXT: movdqa 32(%rdi), %xmm2
858; SSE-NEXT: movdqa 48(%rdi), %xmm5
Jay Foad7b3bbd82023-10-09 12:31:32 +0100859; SSE-NEXT: movdqa 96(%rsi), %xmm11
860; SSE-NEXT: movdqa 80(%rsi), %xmm12
861; SSE-NEXT: movdqa 64(%rsi), %xmm13
Roman Lebedev6893b152022-12-13 04:04:42 +0300862; SSE-NEXT: movdqa (%rsi), %xmm9
Jay Foad7b3bbd82023-10-09 12:31:32 +0100863; SSE-NEXT: movdqa 16(%rsi), %xmm10
Roman Lebedev6893b152022-12-13 04:04:42 +0300864; SSE-NEXT: movdqa 32(%rsi), %xmm14
865; SSE-NEXT: movdqa 48(%rsi), %xmm15
Jay Foad7b3bbd82023-10-09 12:31:32 +0100866; SSE-NEXT: movdqa %xmm8, %xmm7
867; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
868; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
869; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
Roman Lebedev6893b152022-12-13 04:04:42 +0300870; SSE-NEXT: movdqa %xmm1, %xmm9
Jay Foad7b3bbd82023-10-09 12:31:32 +0100871; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
872; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
873; SSE-NEXT: movdqa %xmm2, %xmm10
874; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
Roman Lebedev6893b152022-12-13 04:04:42 +0300875; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
876; SSE-NEXT: movdqa %xmm5, %xmm14
877; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
878; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3]
879; SSE-NEXT: movdqa %xmm3, %xmm15
Jay Foad7b3bbd82023-10-09 12:31:32 +0100880; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
881; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
882; SSE-NEXT: movdqa %xmm4, %xmm13
883; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
884; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
885; SSE-NEXT: movdqa %xmm6, %xmm12
Roman Lebedev6893b152022-12-13 04:04:42 +0300886; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
Jay Foad7b3bbd82023-10-09 12:31:32 +0100887; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
888; SSE-NEXT: movdqa 112(%rsi), %xmm11
889; SSE-NEXT: movdqa %xmm0, %xmm7
890; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
891; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
Roman Lebedev6893b152022-12-13 04:04:42 +0300892; SSE-NEXT: movdqa %xmm0, 224(%rdx)
Jay Foad7b3bbd82023-10-09 12:31:32 +0100893; SSE-NEXT: movdqa %xmm7, 240(%rdx)
Roman Lebedev6893b152022-12-13 04:04:42 +0300894; SSE-NEXT: movdqa %xmm6, 192(%rdx)
Jay Foad7b3bbd82023-10-09 12:31:32 +0100895; SSE-NEXT: movdqa %xmm12, 208(%rdx)
Roman Lebedev6893b152022-12-13 04:04:42 +0300896; SSE-NEXT: movdqa %xmm4, 160(%rdx)
Jay Foad7b3bbd82023-10-09 12:31:32 +0100897; SSE-NEXT: movdqa %xmm13, 176(%rdx)
Roman Lebedev6893b152022-12-13 04:04:42 +0300898; SSE-NEXT: movdqa %xmm3, 128(%rdx)
899; SSE-NEXT: movdqa %xmm15, 144(%rdx)
900; SSE-NEXT: movdqa %xmm5, 96(%rdx)
901; SSE-NEXT: movdqa %xmm14, 112(%rdx)
902; SSE-NEXT: movdqa %xmm2, 64(%rdx)
Jay Foad7b3bbd82023-10-09 12:31:32 +0100903; SSE-NEXT: movdqa %xmm10, 80(%rdx)
Roman Lebedev6893b152022-12-13 04:04:42 +0300904; SSE-NEXT: movdqa %xmm1, 32(%rdx)
905; SSE-NEXT: movdqa %xmm9, 48(%rdx)
Jay Foad7b3bbd82023-10-09 12:31:32 +0100906; SSE-NEXT: movdqa %xmm8, (%rdx)
Roman Lebedev6893b152022-12-13 04:04:42 +0300907; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
908; SSE-NEXT: movaps %xmm0, 16(%rdx)
909; SSE-NEXT: retq
910;
; AVX-LABEL: store_i16_stride2_vf64:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX-NEXT: vmovdqa (%rsi), %xmm4
; AVX-NEXT: vmovdqa 16(%rsi), %xmm5
; AVX-NEXT: vmovdqa 32(%rsi), %xmm6
; AVX-NEXT: vmovdqa 48(%rsi), %xmm7
; AVX-NEXT: vmovdqa (%rdi), %xmm8
; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
; AVX-NEXT: vmovdqa 32(%rdi), %xmm10
; AVX-NEXT: vmovdqa 48(%rdi), %xmm11
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
; AVX-NEXT: vmovdqa 96(%rsi), %xmm10
; AVX-NEXT: vmovdqa 96(%rdi), %xmm13
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
; AVX-NEXT: vmovdqa 112(%rsi), %xmm9
; AVX-NEXT: vmovdqa 112(%rdi), %xmm15
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX-NEXT: vmovdqa %xmm9, 224(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 240(%rdx)
; AVX-NEXT: vmovdqa %xmm5, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm11, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm7, 96(%rdx)
; AVX-NEXT: vmovdqa %xmm13, 112(%rdx)
; AVX-NEXT: vmovdqa %xmm10, 192(%rdx)
; AVX-NEXT: vmovdqa %xmm14, 208(%rdx)
; AVX-NEXT: vmovdqa %xmm6, 64(%rdx)
; AVX-NEXT: vmovdqa %xmm8, 80(%rdx)
; AVX-NEXT: vmovdqa %xmm4, (%rdx)
; AVX-NEXT: vmovdqa %xmm12, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 160(%rdx)
; AVX-NEXT: vmovdqa %xmm2, 176(%rdx)
; AVX-NEXT: vmovdqa %xmm1, 128(%rdx)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 144(%rdx)
; AVX-NEXT: retq
;
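; AVX2 note: ymm vpunpcklwd/vpunpckhwd interleave within each 128-bit lane, so
; every unpack pair below is followed by two vperm2i128 shuffles that put the
; lane halves back into sequential order before the 256-bit stores.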
; AVX2-LABEL: store_i16_stride2_vf64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-NEXT: vmovdqa (%rsi), %ymm4
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm5
; AVX2-NEXT: vmovdqa 64(%rsi), %ymm6
; AVX2-NEXT: vmovdqa 96(%rsi), %ymm7
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1]
; AVX2-NEXT: vmovdqa %ymm3, 192(%rdx)
; AVX2-NEXT: vmovdqa %ymm7, 224(%rdx)
; AVX2-NEXT: vmovdqa %ymm2, 128(%rdx)
; AVX2-NEXT: vmovdqa %ymm6, 160(%rdx)
; AVX2-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-NEXT: vmovdqa %ymm5, 96(%rdx)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm4, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
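; The -FP and -FCP runs only toggle the fast-variable-shuffle tuning flags,
; which make no difference for this pattern, so the next two blocks match the
; plain AVX2 output instruction for instruction.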
; AVX2-FP-LABEL: store_i16_stride2_vf64:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm4
; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm5
; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm6
; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm7
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1]
; AVX2-FP-NEXT: vmovdqa %ymm3, 192(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm7, 224(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm2, 128(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm6, 160(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm5, 96(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%rdx)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride2_vf64:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm4
; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm5
; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm6
; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm7
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vmovdqa %ymm3, 192(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm7, 224(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm2, 128(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm6, 160(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm5, 96(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rdx)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
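; AVX512 note: without AVX512BW there is no zmm-granularity word permute, so
; the next four variants reuse the xmm unpack strategy; the extra EVEX
; registers let one value be parked in %xmm16 via vmovdqa64 instead of being
; spilled to the stack as in the AVX path.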
; AVX512-LABEL: store_i16_stride2_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512-NEXT: vmovdqa (%rsi), %xmm7
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512-NEXT: vmovdqa (%rdi), %xmm12
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm7
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512-FCP-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512DQ-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm7
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
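; AVX512BW note: vpermt2w/vpermi2w permute words across two full zmm sources,
; so each 64-word half of the output takes a single shuffle; the index vectors
; are materialized with vpmovsxbw from byte constants, halving their
; constant-pool footprint.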
; AVX512BW-LABEL: store_i16_stride2_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i16_stride2_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride2_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
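; The IR below concatenates the two <64 x i16> inputs and then applies the
; 0,64,1,65,... interleave mask that the stores above implement.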
  %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <64 x i16>, ptr %in.vecptr1, align 64
  %1 = shufflevector <64 x i16> %in.vec0, <64 x i16> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %interleaved.vec = shufflevector <128 x i16> %1, <128 x i16> poison, <128 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  store <128 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}