blob: 8ac883d1e46ab9c2462c17d1fbaccbb5a03a997b [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle %s -o - | FileCheck %s
; FIXME: All cases here should be fixed by PR34380
define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3],xmm3[4,5,6],xmm0[7]
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4,5,6],xmm0[7]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4],xmm0[5,6],xmm1[7]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4],xmm0[5,6],xmm3[7]
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4],xmm0[5,6],xmm2[7]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm0
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm1
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm0
; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = mem[0],xmm2[1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1,2,3]
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
ret <16 x i16> %res
}
define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
ret <16 x i16> %res
}
define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
ret <16 x i16> %res
}
define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
ret <16 x i16> %res
}
define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
ret <16 x i16> %res
}
define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
ret <16 x i16> %res
}
define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
ret <16 x i16> %res
}
define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
ret <16 x i16> %res
}
define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
ret <16 x i16> %res
}
define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
ret <16 x i16> %res
}
define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
ret <16 x i16> %res
}
define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
%res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
ret <16 x i16> %res
}
define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
ret <8 x i16> %res
}
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <4,0,3,2,u,u,u,u>
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
ret <4 x i32> %res
}
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <4,0,3,2,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <4,0,3,2,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,7,3,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <3,0,7,3,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <5,3,2,5,u,u,u,u>
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
ret <4 x i32> %res
}
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,3,2,5,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,3,2,5,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 16(%rdi), %xmm0
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],mem[0,0]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
ret <4 x i32> %res
}
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],mem[0,0]
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],mem[0,0]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,2,3]
; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
ret <4 x i32> %res
}
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = mem[1,1,2,3]
; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm3
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = mem[1,1,2,3]
; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %vp
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,5,3,6,15,2,9,14]
; CHECK-NEXT: vpermi2d %ymm0, %ymm2, %ymm1
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
ret <8 x i32> %res
}
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [9,5,3,6,15,2,9,14]
; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
ret <8 x i32> %res
}
define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,5,3,6,15,2,9,14]
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,0,15,3,2,3,6,8]
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
ret <8 x i32> %res
}
define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,15,15,2,6,10,14,7]
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
ret <8 x i32> %res
}
define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
ret <8 x i32> %res
}
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [14,5,7,7,10,3,9,3]
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
ret <8 x i32> %res
}
define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
ret <4 x i32> %res
}
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12]
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,1,3,4,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,1,13,0,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <3,0,0,13,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
ret <4 x i32> %res
}
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <3,0,0,13,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u>
; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) {
; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
ret <8 x i32> %res
}
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
ret <8 x i32> %res
}
define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
ret <8 x i32> %res
}
define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
ret <8 x i32> %res
}
define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) {
; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
ret <8 x i32> %res
}
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
ret <8 x i32> %res
}
define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
%res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <13,0,0,6,u,u,u,u>
; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
ret <4 x i32> %res
}
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <13,0,0,6,u,u,u,u>
; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6]
; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [15,5,3,2,15,5,7,6]
; CHECK-NEXT: vpermi2d (%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <2,15,6,9,u,u,u,u>
; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm0
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; CHECK-NEXT: vpextrd $3, %xmm1, %eax
; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm1
; CHECK-NEXT: vpextrd $2, %xmm0, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
ret <4 x i32> %res
}
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
; CHECK-NEXT: vmovd %xmm2, %eax
; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
; CHECK-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
; CHECK-NEXT: vpextrd $3, %xmm3, %eax
; CHECK-NEXT: vpinsrd $2, %eax, %xmm4, %xmm3
; CHECK-NEXT: vpextrd $2, %xmm2, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
ret <4 x i32> %res
}
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vmovd %xmm1, %eax
; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; CHECK-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
; CHECK-NEXT: vpextrd $3, %xmm2, %eax
; CHECK-NEXT: vpinsrd $2, %eax, %xmm3, %xmm2
; CHECK-NEXT: vpextrd $2, %xmm1, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector