| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-SLOW,FALLBACK0 |
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-FAST,FALLBACK1 |
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-SLOW,FALLBACK2 |
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-FAST,FALLBACK3 |
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-SLOW,FALLBACK4 |
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-FAST,FALLBACK5 |
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-SLOW,FALLBACK6 |
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-FAST,FALLBACK7 |
| |
| define void @mask_replication_factor2_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; Replication factor 2, vector of 2: each of the low 2 bits of the loaded mask |
| ; is duplicated (<2 x i1> -> <4 x i1> via <0,0,1,1>), the widened mask predicates |
| ; a zero-fill masked load from %in.vec, and the <4 x i32> result is stored to |
| ; %out.vec. Assembly checks below are autogenerated; regenerate rather than edit. |
| ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf2: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] |
| ; AVX512F-ONLY-NEXT: vptestmd %xmm0, %xmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor2_vf2: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %xmm0 |
| ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] |
| ; AVX512DQ-NEXT: vpmovd2m %xmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor2_vf2: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 |
| ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] |
| ; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1> |
| %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1> |
| %data = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %in.vec, i32 64, <4 x i1> %tgt.mask, <4 x i32> poison) |
| ; NOTE: %data.padded is intentionally dead (never used); only %data is stored below. |
| %data.padded = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <4 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor2_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; Replication factor 2, vector of 4: duplicates each of the low 4 mask bits |
| ; (<4 x i1> -> <8 x i1>), masked-loads <8 x i32> from %in.vec under that mask |
| ; (zero-filled), and stores the result. SLOW vs FAST prefixes cover the two |
| ; shuffle-lowering strategies (vpermq+vpshufd vs a single vpermd with a |
| ; constant-pool index vector). Checks are autogenerated; regenerate, don't edit. |
| ; AVX512F-SLOW-LABEL: mask_replication_factor2_vf4: |
| ; AVX512F-SLOW: # %bb.0: |
| ; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] |
| ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] |
| ; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512F-SLOW-NEXT: vzeroupper |
| ; AVX512F-SLOW-NEXT: retq |
| ; |
| ; AVX512F-FAST-LABEL: mask_replication_factor2_vf4: |
| ; AVX512F-FAST: # %bb.0: |
| ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] |
| ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512F-FAST-NEXT: vzeroupper |
| ; AVX512F-FAST-NEXT: retq |
| ; |
| ; AVX512DQ-SLOW-LABEL: mask_replication_factor2_vf4: |
| ; AVX512DQ-SLOW: # %bb.0: |
| ; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 |
| ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] |
| ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] |
| ; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k1 |
| ; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512DQ-SLOW-NEXT: vzeroupper |
| ; AVX512DQ-SLOW-NEXT: retq |
| ; |
| ; AVX512DQ-FAST-LABEL: mask_replication_factor2_vf4: |
| ; AVX512DQ-FAST: # %bb.0: |
| ; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 |
| ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] |
| ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k1 |
| ; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512DQ-FAST-NEXT: vzeroupper |
| ; AVX512DQ-FAST-NEXT: retq |
| ; |
| ; AVX512BW-SLOW-LABEL: mask_replication_factor2_vf4: |
| ; AVX512BW-SLOW: # %bb.0: |
| ; AVX512BW-SLOW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] |
| ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] |
| ; AVX512BW-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512BW-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512BW-SLOW-NEXT: vzeroupper |
| ; AVX512BW-SLOW-NEXT: retq |
| ; |
| ; AVX512BW-FAST-LABEL: mask_replication_factor2_vf4: |
| ; AVX512BW-FAST: # %bb.0: |
| ; AVX512BW-FAST-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] |
| ; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512BW-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512BW-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512BW-FAST-NEXT: vzeroupper |
| ; AVX512BW-FAST-NEXT: retq |
| ; |
| ; AVX512VBMI-SLOW-LABEL: mask_replication_factor2_vf4: |
| ; AVX512VBMI-SLOW: # %bb.0: |
| ; AVX512VBMI-SLOW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512VBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] |
| ; AVX512VBMI-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] |
| ; AVX512VBMI-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512VBMI-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512VBMI-SLOW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512VBMI-SLOW-NEXT: vzeroupper |
| ; AVX512VBMI-SLOW-NEXT: retq |
| ; |
| ; AVX512VBMI-FAST-LABEL: mask_replication_factor2_vf4: |
| ; AVX512VBMI-FAST: # %bb.0: |
| ; AVX512VBMI-FAST-NEXT: kmovq (%rdi), %k1 |
| ; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] |
| ; AVX512VBMI-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512VBMI-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512VBMI-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512VBMI-FAST-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512VBMI-FAST-NEXT: vzeroupper |
| ; AVX512VBMI-FAST-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3> |
| %data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison) |
| ; NOTE: %data.padded is intentionally dead (never used); only %data is stored below. |
| %data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <8 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; Replication factor 2, vector of 8: duplicates each of the low 8 mask bits |
| ; (<8 x i1> -> <16 x i1>), masked-loads <16 x i32> (one zmm) from %in.vec, and |
| ; stores the result to %out.vec. Checks are autogenerated; regenerate, don't edit. |
| ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf8: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor2_vf8: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovb (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor2_vf8: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovw (%rdi), %k1 |
| ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> |
| %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison) |
| store <16 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; Replication factor 2, vector of 16: duplicates each of the low 16 mask bits |
| ; (<16 x i1> -> <32 x i1>), masked-loads <32 x i32> (two zmm halves) from |
| ; %in.vec, and stores both halves to %out.vec. With AVX512BW the whole widened |
| ; mask is built in one vpermw; otherwise two vpermd+test steps produce two k-regs. |
| ; Checks are autogenerated; regenerate, don't edit. |
| ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf16: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor2_vf16: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor2_vf16: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovw (%rdi), %k0 |
| ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] |
| ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15> |
| %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison) |
| store <32 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; Replication factor 2, vector of 32: duplicates each of the low 32 mask bits |
| ; (<32 x i1> -> <64 x i1>), masked-loads <64 x i32> (four zmm chunks at |
| ; 0/64/128/192 bytes) from %in.vec, and stores all four chunks to %out.vec. |
| ; AVX512F/DQ build four separate k-regs from two 16-bit mask loads; BW/VBMI |
| ; build one 64-bit k-reg via byte shuffles and carve it up with kshift. |
| ; Checks are autogenerated; regenerate, don't edit. |
| ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf32: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor2_vf32: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32: |
| ; AVX512BW-ONLY: # %bb.0: |
| ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 |
| ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] |
| ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512BW-ONLY-NEXT: vzeroupper |
| ; AVX512BW-ONLY-NEXT: retq |
| ; |
| ; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32: |
| ; AVX512VBMI-ONLY: # %bb.0: |
| ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vzeroupper |
| ; AVX512VBMI-ONLY-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> |
| %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31> |
| %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison) |
| store <64 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; Replication factor 2, vector of 64: uses the full <64 x i1> mask directly (no |
| ; narrowing shuffle, unlike the smaller vf cases), duplicates every bit to |
| ; <128 x i1>, masked-loads <128 x i32> (eight zmm chunks, offsets 0..448) from |
| ; %in.vec, and stores all eight chunks. AVX512F/DQ need 8 k-regs and spill one |
| ; to the stack; BW/VBMI build two 64-bit k-regs and carve them with kshift. |
| ; Checks are autogenerated; regenerate, don't edit. |
| ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3 |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k5 |
| ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4 |
| ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k4 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k5} {z} |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k7} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k6} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k4} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor2_vf64: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k5 |
| ; AVX512DQ-NEXT: kmovw 4(%rdi), %k3 |
| ; AVX512DQ-NEXT: kmovw 6(%rdi), %k1 |
| ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 |
| ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k3 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 |
| ; AVX512DQ-NEXT: vpmovm2d %k5, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k5 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} |
| ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf64: |
| ; AVX512BW-ONLY: # %bb.0: |
| ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 |
| ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7] |
| ; AVX512BW-ONLY-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] |
| ; AVX512BW-ONLY-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] |
| ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] |
| ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512BW-ONLY-NEXT: vzeroupper |
| ; AVX512BW-ONLY-NEXT: retq |
| ; |
| ; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf64: |
| ; AVX512VBMI-ONLY: # %bb.0: |
| ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vzeroupper |
| ; AVX512VBMI-ONLY-NEXT: retq |
| %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 |
| %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63> |
| %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison) |
| store <128 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf2: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u> |
| ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512F-ONLY-NEXT: vpslld $31, %ymm0, %ymm0 |
| ; AVX512F-ONLY-NEXT: movb $63, %al |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 |
| ; AVX512F-ONLY-NEXT: vmovq %xmm1, 16(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor3_vf2: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 |
| ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u> |
| ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; AVX512DQ-NEXT: movb $63, %al |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 {%k1} |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 |
| ; AVX512DQ-NEXT: vmovq %xmm1, 16(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor3_vf2: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u> |
| ; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0 |
| ; AVX512BW-NEXT: movb $63, %al |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1} |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 |
| ; AVX512BW-NEXT: vmovq %xmm1, 16(%rdx) |
| ; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1> |
| %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1> |
| %data = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr %in.vec, i32 64, <6 x i1> %tgt.mask, <6 x i32> poison) |
| %data.padded = shufflevector <6 x i32> %data, <6 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <6 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf4: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u> |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 |
| ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor3_vf4: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u> |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1} |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor3_vf4: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u> |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 |
| ; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) |
| ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3> |
| %data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison) |
| %data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <12 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf8: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 |
| ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor3_vf8: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovb (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 |
| ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k2 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor3_vf8: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovw (%rdi), %k0 |
| ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,u,u,u,u,u,u,u,u> |
| ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> |
| %data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison) |
| %data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <24 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf16: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor3_vf16: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-ONLY-LABEL: mask_replication_factor3_vf16: |
| ; AVX512BW-ONLY: # %bb.0: |
| ; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512BW-ONLY-NEXT: vpmovm2b %k1, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] |
| ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,42,43,43,43,44,44,44,45,45,45,46,46,46,47,47,47,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] |
| ; AVX512BW-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm1 |
| ; AVX512BW-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k1 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-ONLY-NEXT: vzeroupper |
| ; AVX512BW-ONLY-NEXT: retq |
| ; |
| ; AVX512VBMI-ONLY-LABEL: mask_replication_factor3_vf16: |
| ; AVX512VBMI-ONLY: # %bb.0: |
| ; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k1, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] |
| ; AVX512VBMI-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm1 |
| ; AVX512VBMI-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k1 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vzeroupper |
| ; AVX512VBMI-ONLY-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15> |
| %data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison) |
| store <48 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf32: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2 |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k3 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k4} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor3_vf32: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k1 |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor3_vf32: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovd (%rdi), %k0 |
| ; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 |
| ; AVX512BW-NEXT: movw $-3, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k2 |
| ; AVX512BW-NEXT: kmovq %k3, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-5, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k4 |
| ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-9, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-17, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k5 |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-33, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: movw $-65, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $2, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-129, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $3, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $4, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k2 |
| ; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k6 |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $5, %k0, %k2 |
| ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k4 |
| ; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 |
| ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovq %k7, %k2 |
| ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $28, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $29, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $30, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $5, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $31, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k7 |
| ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $22, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $23, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $25, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $18, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $20, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftrd $10, %k0, %k4 |
| ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k4, %k2 |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $12, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $13, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $14, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $15, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $7, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $8, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $9, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k2, %k0 |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> |
| %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31> |
| %data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison) |
| store <96 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf64: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm6 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm0 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm8 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm9 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm3 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm10 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm11 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm4 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm2 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm5 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 512(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 576(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 640(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 704(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor3_vf64: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 |
| ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 |
| ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm6 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm8 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm9 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm3 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm10 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm11 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm4 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm2 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm5 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 512(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 576(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 640(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 704(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor3_vf64: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k0 |
| ; AVX512BW-NEXT: kshiftrq $1, %k0, %k1 |
| ; AVX512BW-NEXT: movw $-3, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k2 |
| ; AVX512BW-NEXT: kmovq %k3, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-5, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k4 |
| ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-9, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-17, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-33, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: movw $-65, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $2, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-129, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kmovq %k3, %k5 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $3, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $4, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k6 |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $2, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 |
| ; AVX512BW-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill |
| ; AVX512BW-NEXT: kmovq %k7, %k3 |
| ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $60, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k2, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $61, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $62, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $63, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 |
| ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $54, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $55, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $56, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $57, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k3, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $49, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $50, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $51, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $52, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $42, %k0, %k3 |
| ; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k3, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $44, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $45, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $46, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $47, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $37, %k0, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrq $38, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrq $39, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrq $40, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrq $41, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k0, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $33, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $34, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $35, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $36, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $26, %k0, %k3 |
| ; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k3, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $28, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $29, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $30, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrq $31, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $21, %k0, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrq $22, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrq $23, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrq $24, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrq $25, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrq $16, %k0, %k2 |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $17, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $18, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $19, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $20, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $11, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftrq $10, %k0, %k3 |
| ; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $12, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $13, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $14, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $15, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k2} {z} |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $6, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $7, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $8, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrq $9, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k2, %k0 |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 |
| %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63> |
| %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison) |
| store <192 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-SLOW-LABEL: mask_replication_factor4_vf2: |
| ; AVX512F-SLOW: # %bb.0: |
| ; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] |
| ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] |
| ; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512F-SLOW-NEXT: vzeroupper |
| ; AVX512F-SLOW-NEXT: retq |
| ; |
| ; AVX512F-FAST-LABEL: mask_replication_factor4_vf2: |
| ; AVX512F-FAST: # %bb.0: |
| ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] |
| ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512F-FAST-NEXT: vzeroupper |
| ; AVX512F-FAST-NEXT: retq |
| ; |
| ; AVX512DQ-SLOW-LABEL: mask_replication_factor4_vf2: |
| ; AVX512DQ-SLOW: # %bb.0: |
| ; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 |
| ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] |
| ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] |
| ; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k1 |
| ; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512DQ-SLOW-NEXT: vzeroupper |
| ; AVX512DQ-SLOW-NEXT: retq |
| ; |
| ; AVX512DQ-FAST-LABEL: mask_replication_factor4_vf2: |
| ; AVX512DQ-FAST: # %bb.0: |
| ; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 |
| ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] |
| ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k1 |
| ; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512DQ-FAST-NEXT: vzeroupper |
| ; AVX512DQ-FAST-NEXT: retq |
| ; |
| ; AVX512BW-SLOW-LABEL: mask_replication_factor4_vf2: |
| ; AVX512BW-SLOW: # %bb.0: |
| ; AVX512BW-SLOW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] |
| ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] |
| ; AVX512BW-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512BW-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512BW-SLOW-NEXT: vzeroupper |
| ; AVX512BW-SLOW-NEXT: retq |
| ; |
| ; AVX512BW-FAST-LABEL: mask_replication_factor4_vf2: |
| ; AVX512BW-FAST: # %bb.0: |
| ; AVX512BW-FAST-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] |
| ; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512BW-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512BW-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512BW-FAST-NEXT: vzeroupper |
| ; AVX512BW-FAST-NEXT: retq |
| ; |
| ; AVX512VBMI-SLOW-LABEL: mask_replication_factor4_vf2: |
| ; AVX512VBMI-SLOW: # %bb.0: |
| ; AVX512VBMI-SLOW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512VBMI-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] |
| ; AVX512VBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] |
| ; AVX512VBMI-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512VBMI-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512VBMI-SLOW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512VBMI-SLOW-NEXT: vzeroupper |
| ; AVX512VBMI-SLOW-NEXT: retq |
| ; |
| ; AVX512VBMI-FAST-LABEL: mask_replication_factor4_vf2: |
| ; AVX512VBMI-FAST: # %bb.0: |
| ; AVX512VBMI-FAST-NEXT: kmovq (%rdi), %k1 |
| ; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] |
| ; AVX512VBMI-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512VBMI-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512VBMI-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} |
| ; AVX512VBMI-FAST-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512VBMI-FAST-NEXT: vzeroupper |
| ; AVX512VBMI-FAST-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1> |
| %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> |
| %data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison) |
| %data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <8 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| ; Replication factor 4, vector factor 4: take the low 4 bits of the <64 x i1> |
| ; mask, replicate each bit 4x into a <16 x i1> mask, masked-load <16 x i32> |
| ; from %in.vec and store the result to %out.vec. |
| define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf4: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor4_vf4: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor4_vf4: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
|   %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
|   %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
|   %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3> |
|   %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison) |
|   store <16 x i32> %data, ptr %out.vec, align 64 |
|   ret void |
| } |
| |
| ; Replication factor 4, vector factor 8: take the low 8 bits of the <64 x i1> |
| ; mask, replicate each bit 4x into a <32 x i1> mask, masked-load <32 x i32> |
| ; from %in.vec and store the result to %out.vec. |
| define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf8: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor4_vf8: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovb (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor4_vf8: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovw (%rdi), %k0 |
| ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] |
| ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
|   %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
|   %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
|   %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7> |
|   %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison) |
|   store <32 x i32> %data, ptr %out.vec, align 64 |
|   ret void |
| } |
| |
| ; Replication factor 4, vector factor 16: take the low 16 bits of the |
| ; <64 x i1> mask, replicate each bit 4x into a <64 x i1> mask, masked-load |
| ; <64 x i32> (4 zmm registers' worth) from %in.vec and store to %out.vec. |
| define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf16: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor4_vf16: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16: |
| ; AVX512BW-ONLY: # %bb.0: |
| ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 |
| ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] |
| ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512BW-ONLY-NEXT: vzeroupper |
| ; AVX512BW-ONLY-NEXT: retq |
| ; |
| ; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16: |
| ; AVX512VBMI-ONLY: # %bb.0: |
| ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vzeroupper |
| ; AVX512VBMI-ONLY-NEXT: retq |
|   %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
|   %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
|   %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15> |
|   %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison) |
|   store <64 x i32> %data, ptr %out.vec, align 64 |
|   ret void |
| } |
| |
| ; Replication factor 4, vector factor 32: take the low 32 bits of the |
| ; <64 x i1> mask, replicate each bit 4x into a <128 x i1> mask, masked-load |
| ; <128 x i32> (8 zmm registers' worth) from %in.vec and store to %out.vec. |
| ; The AVX512F-ONLY/AVX512DQ paths need 8 mask registers, so one spills. |
| define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf32: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k5} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor4_vf32: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} |
| ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf32: |
| ; AVX512BW-ONLY: # %bb.0: |
| ; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 |
| ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] |
| ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512BW-ONLY-NEXT: vzeroupper |
| ; AVX512BW-ONLY-NEXT: retq |
| ; |
| ; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf32: |
| ; AVX512VBMI-ONLY: # %bb.0: |
| ; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vzeroupper |
| ; AVX512VBMI-ONLY-NEXT: retq |
|   %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
|   %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> |
|   %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31> |
|   %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison) |
|   store <128 x i32> %data, ptr %out.vec, align 64 |
|   ret void |
| } |
| |
| define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf64: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm10 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm11 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm12 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm13 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm14 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm15 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm2 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm9, %zmm9 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm13 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm12 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm8 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 896(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 832(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 768(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 576(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 512(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor4_vf64: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm10 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm11 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm12 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm1 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm13 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm14 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm15 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm2 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm9, %zmm9 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm13 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm12 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm8 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 896(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 832(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 768(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 640(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 576(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 512(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf64: |
| ; AVX512BW-ONLY: # %bb.0: |
| ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 |
| ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] |
| ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] |
| ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] |
| ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512BW-ONLY-NEXT: vzeroupper |
| ; AVX512BW-ONLY-NEXT: retq |
| ; |
| ; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf64: |
| ; AVX512VBMI-ONLY: # %bb.0: |
| ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [48,48,48,48,49,49,49,49,50,50,50,50,51,51,51,51,52,52,52,52,53,53,53,53,54,54,54,54,55,55,55,55,56,56,56,56,57,57,57,57,58,58,58,58,59,59,59,59,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,33,33,33,33,34,34,34,34,35,35,35,35,36,36,36,36,37,37,37,37,38,38,38,38,39,39,39,39,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,44,44,44,44,45,45,45,45,46,46,46,46,47,47,47,47] |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vzeroupper |
| ; AVX512VBMI-ONLY-NEXT: retq |
| %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 |
| %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, 
i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63> |
| %data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison) |
| store <256 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| ; Replication factor 5, vf2: takes the first 2 bits of the loaded <64 x i1> |
| ; mask, repeats each bit 5 times to form a <10 x i1> mask, performs a masked |
| ; load of <10 x i32> from %in.vec with it, and stores the result to %out.vec. |
| ; CHECK lines below are autogenerated — regenerate with |
| ; utils/update_llc_test_checks.py instead of editing them by hand. |
| define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf2: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u> |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 |
| ; AVX512F-ONLY-NEXT: movw $1023, %ax # imm = 0x3FF |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, %xmm1 |
| ; AVX512F-ONLY-NEXT: vmovq %xmm1, 32(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor5_vf2: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u> |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; AVX512DQ-NEXT: movw $1023, %ax # imm = 0x3FF |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1} |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 |
| ; AVX512DQ-NEXT: vmovq %xmm1, 32(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor5_vf2: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u> |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 |
| ; AVX512BW-NEXT: movw $1023, %ax # imm = 0x3FF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 |
| ; AVX512BW-NEXT: vmovq %xmm1, 32(%rdx) |
| ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1> |
| %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1> |
| %data = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr %in.vec, i32 64, <10 x i1> %tgt.mask, <10 x i32> poison) |
| %data.padded = shufflevector <10 x i32> %data, <10 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <10 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| ; Replication factor 5, vf4: takes the first 4 bits of the loaded <64 x i1> |
| ; mask, repeats each bit 5 times to form a <20 x i1> mask, performs a masked |
| ; load of <20 x i32> from %in.vec with it, and stores the result to %out.vec. |
| ; CHECK lines below are autogenerated — regenerate with |
| ; utils/update_llc_test_checks.py instead of editing them by hand. |
| define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf4: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] |
| ; AVX512F-ONLY-NEXT: vpslld $31, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: movw $15, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa %xmm0, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor5_vf4: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] |
| ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 |
| ; AVX512DQ-NEXT: movw $15, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor5_vf4: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k0 |
| ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,u,u,u,u,u,u,u,u,u,u,u,u> |
| ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; AVX512BW-NEXT: movl $1048575, %eax # imm = 0xFFFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-NEXT: vmovdqa %xmm0, 64(%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3> |
| %data = call <20 x i32> @llvm.masked.load.v20i32.p0(ptr %in.vec, i32 64, <20 x i1> %tgt.mask, <20 x i32> poison) |
| %data.padded = shufflevector <20 x i32> %data, <20 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <20 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| ; Replication factor 5, vf8: takes the first 8 bits of the loaded <64 x i1> |
| ; mask, repeats each bit 5 times to form a <40 x i1> mask, performs a masked |
| ; load of <40 x i32> from %in.vec with it, and stores the result to %out.vec. |
| ; CHECK lines below are autogenerated — regenerate with |
| ; utils/update_llc_test_checks.py instead of editing them by hand. |
| define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf8: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 |
| ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor5_vf8: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovb (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 |
| ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 |
| ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k3 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf8: |
| ; AVX512BW-ONLY: # %bb.0: |
| ; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 |
| ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] |
| ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] |
| ; AVX512BW-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; AVX512BW-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF |
| ; AVX512BW-ONLY-NEXT: kmovq %rax, %k1 |
| ; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-ONLY-NEXT: vzeroupper |
| ; AVX512BW-ONLY-NEXT: retq |
| ; |
| ; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf8: |
| ; AVX512VBMI-ONLY: # %bb.0: |
| ; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; AVX512VBMI-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF |
| ; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1 |
| ; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vzeroupper |
| ; AVX512VBMI-ONLY-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7> |
| %data = call <40 x i32> @llvm.masked.load.v40i32.p0(ptr %in.vec, i32 64, <40 x i1> %tgt.mask, <40 x i32> poison) |
| %data.padded = shufflevector <40 x i32> %data, <40 x i32> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <40 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| ; Replication factor 5, vf16: takes the first 16 bits of the loaded <64 x i1> |
| ; mask, repeats each bit 5 times to form an <80 x i1> mask, performs a masked |
| ; load of <80 x i32> from %in.vec with it, and stores the result to %out.vec. |
| ; CHECK lines below are autogenerated — regenerate with |
| ; utils/update_llc_test_checks.py instead of editing them by hand. |
| define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf16: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor5_vf16: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor5_vf16: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovw (%rdi), %k1 |
| ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 |
| ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 |
| ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 |
| ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm0 |
| ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15> |
| %data = call <80 x i32> @llvm.masked.load.v80i32.p0(ptr %in.vec, i32 64, <80 x i1> %tgt.mask, <80 x i32> poison) |
| store <80 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf32: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm3 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 512(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 576(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor5_vf32: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm3 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 512(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 576(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor5_vf32: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovd (%rdi), %k0 |
| ; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 |
| ; AVX512BW-NEXT: movw $-3, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k6 |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-5, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k4 |
| ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-9, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k4 |
| ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-17, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k4 |
| ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: movw $-33, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-65, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-129, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF |
| ; AVX512BW-NEXT: kmovd %eax, %k7 |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k3 |
| ; AVX512BW-NEXT: kshiftrd $2, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF |
| ; AVX512BW-NEXT: kmovd %eax, %k4 |
| ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k4 |
| ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $3, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k4 |
| ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $3, %k0, %k2 |
| ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $29, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 |
| ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k3 |
| ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $30, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $31, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 |
| ; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $3, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $2, %k4, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrd $25, %k0, %k2 |
| ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrd $26, %k0, %k7 |
| ; AVX512BW-NEXT: kmovq %k0, %k4 |
| ; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 |
| ; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrd $27, %k4, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $8, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k3, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $22, %k0, %k3 |
| ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k3, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $13, %k3, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k5, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrd $23, %k0, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 |
| ; AVX512BW-NEXT: kshiftrw $12, %k7, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k6, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k4, %k5, %k5 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k5, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 |
| ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrd $20, %k0, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $7, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrd $21, %k0, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kandw %k3, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k4, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm4 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k3, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $17, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $18, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 |
| ; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $13, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $12, %k0, %k3 |
| ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $14, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $15, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k4 |
| ; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm6 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $9, %k0, %k2 |
| ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrd $10, %k0, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kandw %k7, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $9, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrd $11, %k0, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kandw %k3, %k4, %k4 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $15, %k7, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $2, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftlw $14, %k7, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k4, %k3 |
| ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k3} {z} |
| ; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 |
| ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k4, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrd $7, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrd $8, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k5, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm8 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $14, %k5, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k5, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k5, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $4, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 512(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 576(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> |
| %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31> |
| %data = call <160 x i32> @llvm.masked.load.v160i32.p0(ptr %in.vec, i32 64, <160 x i1> %tgt.mask, <160 x i32> poison) |
| store <160 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf64: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm13 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm14 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm15 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm16 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm17 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm18 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm19 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm8 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1152(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1088(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1024(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 960(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 896(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 832(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 768(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 704(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 640(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor5_vf64: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 |
| ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm13 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm14 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm15 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm16 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm17 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm18 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm19 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm5 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm8 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1152(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1088(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1024(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 960(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 768(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 640(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor5_vf64: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k5 |
| ; AVX512BW-NEXT: kshiftrq $1, %k5, %k0 |
| ; AVX512BW-NEXT: movw $-3, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-5, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-9, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-17, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: movw $-33, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-65, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-129, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k2, %k0 |
| ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftrq $2, %k5, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k7 |
| ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k7, %k7 |
| ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k7, %k7 |
| ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k6 |
| ; AVX512BW-NEXT: kandw %k6, %k7, %k7 |
| ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $3, %k5, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k7, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $4, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $5, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $6, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $7, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $8, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $10, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $11, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $12, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrq $13, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $14, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $15, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $19, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $20, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $21, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $22, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z} |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $23, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $24, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $25, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $26, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $27, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $28, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k6} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrq $29, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $30, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $31, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $33, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $34, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $35, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $36, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $39, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $40, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $42, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $43, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $44, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z} |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $46, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $50, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $51, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $52, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $54, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $55, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $56, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k6} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrq $61, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $62, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k5, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 |
| %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, 
i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63> |
| %data = call <320 x i32> @llvm.masked.load.v320i32.p0(ptr %in.vec, i32 64, <320 x i1> %tgt.mask, <320 x i32> poison) |
| store <320 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf2: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u> |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 |
| ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor6_vf2: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u> |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1} |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor6_vf2: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u> |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 |
| ; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) |
| ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1> |
| %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> |
| %data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison) |
| %data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <12 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-SLOW-LABEL: mask_replication_factor6_vf4: |
| ; AVX512F-SLOW: # %bb.0: |
| ; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] |
| ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] |
| ; AVX512F-SLOW-NEXT: vpslld $31, %zmm1, %zmm1 |
| ; AVX512F-SLOW-NEXT: movw $255, %ax |
| ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 |
| ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} |
| ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k2 |
| ; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} |
| ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx) |
| ; AVX512F-SLOW-NEXT: vzeroupper |
| ; AVX512F-SLOW-NEXT: retq |
| ; |
| ; AVX512F-FAST-LABEL: mask_replication_factor6_vf4: |
| ; AVX512F-FAST: # %bb.0: |
| ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] |
| ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 |
| ; AVX512F-FAST-NEXT: vpslld $31, %zmm1, %zmm1 |
| ; AVX512F-FAST-NEXT: movw $255, %ax |
| ; AVX512F-FAST-NEXT: kmovw %eax, %k1 |
| ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} |
| ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k2 |
| ; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} |
| ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 64(%rdx) |
| ; AVX512F-FAST-NEXT: vzeroupper |
| ; AVX512F-FAST-NEXT: retq |
| ; |
| ; AVX512DQ-SLOW-LABEL: mask_replication_factor6_vf4: |
| ; AVX512DQ-SLOW: # %bb.0: |
| ; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] |
| ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] |
| ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 |
| ; AVX512DQ-SLOW-NEXT: movw $255, %ax |
| ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} |
| ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2 |
| ; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} |
| ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx) |
| ; AVX512DQ-SLOW-NEXT: vzeroupper |
| ; AVX512DQ-SLOW-NEXT: retq |
| ; |
| ; AVX512DQ-FAST-LABEL: mask_replication_factor6_vf4: |
| ; AVX512DQ-FAST: # %bb.0: |
| ; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] |
| ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 |
| ; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 |
| ; AVX512DQ-FAST-NEXT: movw $255, %ax |
| ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} |
| ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2 |
| ; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} |
| ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx) |
| ; AVX512DQ-FAST-NEXT: vzeroupper |
| ; AVX512DQ-FAST-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor6_vf4: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k0 |
| ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,u,u,u,u,u,u,u,u> |
| ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> |
| %data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison) |
| %data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> |
| store <24 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf8: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor6_vf8: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovb (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-ONLY-LABEL: mask_replication_factor6_vf8: |
| ; AVX512BW-ONLY: # %bb.0: |
| ; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512BW-ONLY-NEXT: vpmovm2b %k1, %zmm0 |
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] |
| ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21,37,37,37,37,38,38,38,38,38,38,39,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] |
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512BW-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512BW-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm1 |
| ; AVX512BW-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k1 |
| ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx) |
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-ONLY-NEXT: vzeroupper |
| ; AVX512BW-ONLY-NEXT: retq |
| ; |
| ; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf8: |
| ; AVX512VBMI-ONLY: # %bb.0: |
| ; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k1, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> |
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 |
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512VBMI-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512VBMI-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm1 |
| ; AVX512VBMI-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k1 |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512VBMI-ONLY-NEXT: vzeroupper |
| ; AVX512VBMI-ONLY-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> |
| %data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison) |
| store <48 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; Replication factor 6, vector factor 16: the low 16 bits of the <64 x i1> |
| ; mask loaded from %in.maskvec are each replicated 6x into a <96 x i1> mask, |
| ; which gates a masked load of <96 x i32> from %in.vec; the loaded data |
| ; (384 bytes, written back as six zmm stores) goes to %out.vec. The assertion |
| ; lines below are autogenerated compiler output -- regenerate with |
| ; utils/update_llc_test_checks.py rather than editing by hand. |
| ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf16: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor6_vf16: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z} |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor6_vf16: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovw (%rdi), %k1 |
| ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 |
| ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 |
| ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 |
| ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 |
| ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 |
| ; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm0 |
| ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| ; Reference IR: take the low 16 bits of the padded mask, replicate each bit |
| ; 6x via shufflevector, then do one wide masked load and a plain store. |
|   %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
|   %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
|   %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> |
|   %data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison) |
|   store <96 x i32> %data, ptr %out.vec, align 64 |
|   ret void |
| } |
| |
| define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf32: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm3 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 512(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 576(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 640(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 704(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor6_vf32: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm3 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 576(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 640(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 704(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor6_vf32: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovd (%rdi), %k5 |
| ; AVX512BW-NEXT: movw $-3, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-5, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-9, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-17, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k7 |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-33, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: movw $-65, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftrd $1, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-129, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF |
| ; AVX512BW-NEXT: kmovd %eax, %k6 |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $2, %k5, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 |
| ; AVX512BW-NEXT: kmovq %k2, %k4 |
| ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k4, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $29, %k5, %k0 |
| ; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $30, %k5, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k3, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k3, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k3, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k3, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k3, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k3 |
| ; AVX512BW-NEXT: kshiftrd $31, %k5, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftlw $14, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k1, %k3, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $26, %k5, %k3 |
| ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrd $27, %k5, %k7 |
| ; AVX512BW-NEXT: kmovq %k5, %k3 |
| ; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 |
| ; AVX512BW-NEXT: kshiftrw $13, %k7, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k6, %k5 |
| ; AVX512BW-NEXT: kandw %k4, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrd $28, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k5, %k4 |
| ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k2, %k4, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $24, %k0, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k4, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $25, %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $4, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k4 |
| ; AVX512BW-NEXT: kmovq %k0, %k1 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $21, %k2, %k1 |
| ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k5 |
| ; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k5, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k5, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k5, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrd $22, %k2, %k4 |
| ; AVX512BW-NEXT: kmovq %k2, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k4 |
| ; AVX512BW-NEXT: kshiftrd $23, %k6, %k5 |
| ; AVX512BW-NEXT: kmovq %k6, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k3, %k4, %k3 |
| ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z} |
| ; AVX512BW-NEXT: kmovq %k7, %k4 |
| ; AVX512BW-NEXT: kshiftrd $18, %k7, %k6 |
| ; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k3 |
| ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrd $19, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrd $20, %k4, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k5, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $13, %k0, %k2 |
| ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k6, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrd $14, %k0, %k4 |
| ; AVX512BW-NEXT: kmovq %k0, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k4 |
| ; AVX512BW-NEXT: kshiftrd $15, %k7, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k3, %k4, %k3 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k3} {z} |
| ; AVX512BW-NEXT: kmovq %k7, %k3 |
| ; AVX512BW-NEXT: kshiftrd $10, %k7, %k0 |
| ; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrd $11, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrd $12, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k5, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $8, %k3, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $9, %k3, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $5, %k1, %k2 |
| ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 |
| ; AVX512BW-NEXT: kshiftrw $14, %k7, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k7, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k7, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrd $6, %k1, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k4 |
| ; AVX512BW-NEXT: kshiftrd $7, %k1, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k3, %k4, %k3 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k3} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrd $3, %k1, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrw $13, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrd $4, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k3, %k0 |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> |
| %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> |
| %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison) |
| store <192 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm14 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm15 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm16 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm17 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm18 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm19 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm20 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm21 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm22 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm23 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm18 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm17 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm14 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm10 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm8 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1408(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1344(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1280(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1216(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1152(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1088(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 960(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 896(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 832(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 768(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 704(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 640(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 576(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 512(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor6_vf64: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 |
| ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm14 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm15 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm16 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm17 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm18 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm19 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm20 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm21 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm22 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm23 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm5 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm18 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm17 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm14 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm10 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm8 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1408(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1344(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1280(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1216(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1152(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1088(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 960(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 832(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 768(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 704(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 640(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 576(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor6_vf64: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k5 |
| ; AVX512BW-NEXT: movw $-3, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-5, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-9, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-17, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-33, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-65, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $1, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-129, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftrq $2, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k7 |
| ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k7, %k7 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k0, %k6, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $3, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $4, %k5, %k1 |
| ; AVX512BW-NEXT: kmovq %k5, %k7 |
| ; AVX512BW-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $5, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $6, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $7, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $8, %k7, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $9, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $10, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $11, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $12, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $13, %k7, %k1 |
| ; AVX512BW-NEXT: kmovq %k7, %k2 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq %k2, %k7 |
| ; AVX512BW-NEXT: kshiftrq $14, %k2, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $15, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 |
| ; AVX512BW-NEXT: kmovq %k5, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 |
| ; AVX512BW-NEXT: kmovq %k7, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq %k3, %k7 |
| ; AVX512BW-NEXT: kshiftrq $19, %k3, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $20, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq %k7, %k4 |
| ; AVX512BW-NEXT: kshiftrq $21, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $22, %k4, %k1 |
| ; AVX512BW-NEXT: kmovq %k4, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $23, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $24, %k5, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $25, %k5, %k1 |
| ; AVX512BW-NEXT: kmovq %k5, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $26, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $27, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $28, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $29, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $30, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $31, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $33, %k5, %k1 |
| ; AVX512BW-NEXT: kmovq %k5, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 |
| ; AVX512BW-NEXT: kmovq %k7, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq %k4, %k7 |
| ; AVX512BW-NEXT: kshiftrq $35, %k4, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $36, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq %k7, %k3 |
| ; AVX512BW-NEXT: kshiftrq $37, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $38, %k3, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $39, %k3, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} |
| ; AVX512BW-NEXT: kmovq %k3, %k7 |
| ; AVX512BW-NEXT: kshiftrq $40, %k3, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $41, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $42, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $43, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $44, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $45, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $46, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $47, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 |
| ; AVX512BW-NEXT: kmovq %k5, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $50, %k2, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq %k2, %k7 |
| ; AVX512BW-NEXT: kshiftrq $51, %k2, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $52, %k7, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $53, %k7, %k1 |
| ; AVX512BW-NEXT: kmovq %k7, %k4 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq %k4, %k7 |
| ; AVX512BW-NEXT: kshiftrq $54, %k4, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $55, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $56, %k5, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $61, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $62, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $63, %k5, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k2, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 |
| %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63> |
| %data = call <384 x i32> @llvm.masked.load.v384i32.p0(ptr %in.vec, i32 64, <384 x i1> %tgt.mask, <384 x i32> poison) |
| store <384 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| ; Replication factor 7, vector factor 2: the low 2 bits of the input mask are |
| ; each replicated 7 times to form a 14-bit mask that gates a masked load of |
| ; <14 x i32>; the loaded data is then stored to the output pointer. |
| define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf2: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u> |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 |
| ; AVX512F-ONLY-NEXT: movw $16383, %ax # imm = 0x3FFF |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) |
| ; AVX512F-ONLY-NEXT: vextracti32x4 $3, %zmm0, %xmm1 |
| ; AVX512F-ONLY-NEXT: vmovq %xmm1, 48(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor7_vf2: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u> |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; AVX512DQ-NEXT: movw $16383, %ax # imm = 0x3FFF |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1} |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) |
| ; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 |
| ; AVX512DQ-NEXT: vmovq %xmm1, 48(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor7_vf2: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k1 |
| ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u> |
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 |
| ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 |
| ; AVX512BW-NEXT: movw $16383, %ax # imm = 0x3FFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) |
| ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 |
| ; AVX512BW-NEXT: vmovq %xmm1, 48(%rdx) |
| ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| ; Load the full 64-bit padded mask, then keep only its low 2 bits. |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1> |
| ; Replicate each of the 2 mask bits 7 times -> 14-element mask. |
| %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> |
| ; Masked load of 14 dwords (the second operand, i32 64, is the alignment). |
| ; %data.padded widens to 16 lanes but is otherwise unused in this test. |
| %data = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr %in.vec, i32 64, <14 x i1> %tgt.mask, <14 x i32> poison) |
| %data.padded = shufflevector <14 x i32> %data, <14 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef> |
| store <14 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
; Replication factor 7, vector width 4: each of the low 4 mask bits is
; duplicated 7 times (<4 x i1> -> <28 x i1>); that mask drives a masked load of
; 28 x i32 from %in.vec, and the result is stored to %out.vec.
; NOTE(review): the CHECK lines below are autogenerated expected llc output —
; do not edit them by hand; rerun utils/update_llc_test_checks.py instead.
define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u>
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 96(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u>
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 96(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u>
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: movl $268435455, %eax # imm = 0xFFFFFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 96(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  ; Load the padded 64-bit mask, keep only the low 4 bits, and replicate each
  ; bit 7 times to build the 28-lane target mask.
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ; Masked-load 28 i32 elements under the replicated mask and store them out.
  ; %data.padded is dead (only %data is stored); it pads 28 lanes up to 32.
  %data = call <28 x i32> @llvm.masked.load.v28i32.p0(ptr %in.vec, i32 64, <28 x i1> %tgt.mask, <28 x i32> poison)
  %data.padded = shufflevector <28 x i32> %data, <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef>
  store <28 x i32> %data, ptr %out.vec, align 64
  ret void
}
| |
; Replication factor 7, vector width 8: each of the low 8 mask bits is
; duplicated 7 times (<8 x i1> -> <56 x i1>); that mask drives a masked load of
; 56 x i32 from %in.vec, and the result is stored to %out.vec. This width also
; exercises the SLOW/FAST variable-shuffle check prefixes for the AVX512F and
; AVX512DQ configurations.
; NOTE(review): the CHECK lines below are autogenerated expected llc output —
; do not edit them by hand; rerun utils/update_llc_test_checks.py instead.
define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-SLOW-LABEL: mask_replication_factor7_vf8:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1
; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-SLOW-NEXT: movw $1, %ax
; AVX512F-SLOW-NEXT: kmovw %eax, %k2
; AVX512F-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
; AVX512F-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 192(%rdx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: mask_replication_factor7_vf8:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: kmovw (%rdi), %k1
; AVX512F-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-FAST-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; AVX512F-FAST-NEXT: movw $1, %ax
; AVX512F-FAST-NEXT: kmovw %eax, %k2
; AVX512F-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-FAST-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
; AVX512F-FAST-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512F-FAST-NEXT: vmovdqa %ymm1, 192(%rdx)
; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512DQ-SLOW-LABEL: mask_replication_factor7_vf8:
; AVX512DQ-SLOW: # %bb.0:
; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-SLOW-NEXT: vpmovm2d %k1, %zmm1
; AVX512DQ-SLOW-NEXT: movw $1, %ax
; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k4
; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 192(%rdx)
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-SLOW-NEXT: vzeroupper
; AVX512DQ-SLOW-NEXT: retq
;
; AVX512DQ-FAST-LABEL: mask_replication_factor7_vf8:
; AVX512DQ-FAST: # %bb.0:
; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0
; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-FAST-NEXT: vpmovm2d %k1, %zmm1
; AVX512DQ-FAST-NEXT: movw $1, %ax
; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k4
; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-FAST-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
; AVX512DQ-FAST-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 192(%rdx)
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf8:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf8:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,u,u,u,u,u,u,u,u>
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VBMI-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1
; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
  ; Load the padded 64-bit mask, keep only the low 8 bits, and replicate each
  ; bit 7 times to build the 56-lane target mask.
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ; Masked-load 56 i32 elements under the replicated mask and store them out.
  ; %data.padded is dead (only %data is stored); it pads 56 lanes up to 64.
  %data = call <56 x i32> @llvm.masked.load.v56i32.p0(ptr %in.vec, i32 64, <56 x i1> %tgt.mask, <56 x i32> poison)
  %data.padded = shufflevector <56 x i32> %data, <56 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <56 x i32> %data, ptr %out.vec, align 64
  ret void
}
| |
; Replication factor 7, vector width 16: each of the low 16 mask bits is
; duplicated 7 times (<16 x i1> -> <112 x i1>); that mask drives a masked load
; of 112 x i32 from %in.vec, and the result is stored to %out.vec. At this
; width the load count (112) is a multiple of 16, so there is no padding
; shuffle on the data.
; NOTE(review): the CHECK lines below are autogenerated expected llc output —
; do not edit them by hand; rerun utils/update_llc_test_checks.py instead.
define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6
; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z}
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf16:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k2
; AVX512BW-ONLY-NEXT: vpmovm2b %k2, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,45,45,46,46,46,46,46,46,46,47,47,47,47,47,47,47,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512BW-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512BW-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512BW-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm5
; AVX512BW-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512BW-ONLY-NEXT: vpermd %zmm0, %zmm6, %zmm6
; AVX512BW-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512BW-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm0
; AVX512BW-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf16:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k2
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k2, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13,13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512VBMI-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2
; AVX512VBMI-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
; AVX512VBMI-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm5
; AVX512VBMI-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512VBMI-ONLY-NEXT: vpermd %zmm0, %zmm6, %zmm6
; AVX512VBMI-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512VBMI-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm0
; AVX512VBMI-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
  ; Load the padded 64-bit mask, keep only the low 16 bits, and replicate each
  ; bit 7 times to build the 112-lane target mask.
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  ; Masked-load 112 i32 elements under the replicated mask and store them out.
  %data = call <112 x i32> @llvm.masked.load.v112i32.p0(ptr %in.vec, i32 64, <112 x i1> %tgt.mask, <112 x i32> poison)
  store <112 x i32> %data, ptr %out.vec, align 64
  ret void
}
| |
| define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf32: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm11 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm3 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 512(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 576(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 640(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 704(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 768(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor7_vf32: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm11 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm3 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 512(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 640(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 704(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor7_vf32: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovd (%rdi), %k6 |
| ; AVX512BW-NEXT: movw $-3, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kandw %k0, %k6, %k1 |
| ; AVX512BW-NEXT: kmovq %k0, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-5, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovq %k2, %k3 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-9, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-17, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-33, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-65, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: movw $-129, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrd $1, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF |
| ; AVX512BW-NEXT: kmovd %eax, %k7 |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrd $2, %k6, %k2 |
| ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: korw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: kmovq %k6, %k2 |
| ; AVX512BW-NEXT: kshiftrd $29, %k6, %k1 |
| ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovq %k4, %k6 |
| ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrd $30, %k2, %k1 |
| ; AVX512BW-NEXT: kmovq %k2, %k4 |
| ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftrd $31, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k3, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $27, %k2, %k1 |
| ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k7 |
| ; AVX512BW-NEXT: kshiftrd $28, %k2, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k7, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k6, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $4, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k2} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $25, %k6, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k5 |
| ; AVX512BW-NEXT: kshiftrd $26, %k6, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k4, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k5, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $23, %k3, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftrd $22, %k3, %k5 |
| ; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovq %k3, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k3 |
| ; AVX512BW-NEXT: kshiftrw $14, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $24, %k6, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 |
| ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k2} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $20, %k3, %k5 |
| ; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 |
| ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k5 |
| ; AVX512BW-NEXT: kshiftrd $21, %k3, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k5, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k2, %k5 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $15, %k7, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k4, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftlw $14, %k7, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $18, %k4, %k2 |
| ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 |
| ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k6 |
| ; AVX512BW-NEXT: kshiftrd $19, %k4, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $5, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k6, %k5 |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 |
| ; AVX512BW-NEXT: kmovq %k3, %k7 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k5, %k3 |
| ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k3} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k3, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k3 |
| ; AVX512BW-NEXT: kshiftrd $17, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k3, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: korw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $13, %k0, %k1 |
| ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k3 |
| ; AVX512BW-NEXT: kshiftrd $14, %k0, %k2 |
| ; AVX512BW-NEXT: kmovq %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k3, %k3 |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k5 |
| ; AVX512BW-NEXT: kshiftrd $15, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k5, %k2 |
| ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k2} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $11, %k3, %k6 |
| ; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k7, %k5, %k6 |
| ; AVX512BW-NEXT: kshiftrd $12, %k3, %k5 |
| ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $11, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k6, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $4, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $9, %k6, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k5 |
| ; AVX512BW-NEXT: kshiftrd $10, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k4, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k5, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $7, %k4, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $6, %k4, %k5 |
| ; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovq %k4, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k5, %k2 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftrd $8, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k5 |
| ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $4, %k6, %k1 |
| ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k5 |
| ; AVX512BW-NEXT: kshiftrd $5, %k6, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k3, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k5, %k2 |
| ; AVX512BW-NEXT: kandw %k4, %k2, %k5 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $15, %k3, %k2 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kandw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k5, %k5 |
| ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k5, %k3 |
| ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k2} {z} |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kshiftrw $14, %k4, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $13, %k4, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $12, %k4, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $11, %k4, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k2, %k3 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftrd $3, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $10, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k3, %k2 |
| ; AVX512BW-NEXT: kandw %k1, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k7, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k2, %k2 |
| ; AVX512BW-NEXT: kandw %k6, %k2, %k2 |
| ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k2, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm8, 384(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm6, 512(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 832(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> |
| %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> |
| %data = call <224 x i32> @llvm.masked.load.v224i32.p0(ptr %in.vec, i32 64, <224 x i1> %tgt.mask, <224 x i32> poison) |
| store <224 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: movw $1, %ax |
| ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm14 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm15, %zmm16 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm17 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm18 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm19 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm20 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm21 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm15, %zmm22 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm23 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm24 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm25 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm26 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm27 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm15, %zmm5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm0 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm27 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm26 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm25 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm24 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm23 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm19 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm18 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm16 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm14 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm12 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm10 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm8 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm6 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1728(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1664(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1600(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1536(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1472(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1408(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1280(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 1216(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1152(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1088(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1024(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 960(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 832(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 768(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 704(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 640(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 576(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 512(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor7_vf64: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 |
| ; AVX512DQ-NEXT: movw $1, %ax |
| ; AVX512DQ-NEXT: kmovw %eax, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} |
| ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 |
| ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm14 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm15, %zmm16 |
| ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm17 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm18 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm19 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm20 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm21 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm15, %zmm22 |
| ; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm23 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm24 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm25 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm26 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm27 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpermd %zmm5, %zmm15, %zmm5 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm0 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm27 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm26 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm25 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm24 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm23 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm19 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm18 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm16 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm14 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm12 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm10 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm8 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm6 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1728(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1664(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1600(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1536(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1472(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1408(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1280(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1216(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1152(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1088(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1024(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 960(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 832(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 768(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 704(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 640(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 576(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor7_vf64: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k4 |
| ; AVX512BW-NEXT: movw $-3, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k0 |
| ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k0, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k4, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-5, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-9, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-17, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-33, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-65, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-129, %ax |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $1, %k4, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq %k2, %k5 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k3 |
| ; AVX512BW-NEXT: korw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k2 |
| ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF |
| ; AVX512BW-NEXT: kmovd %eax, %k1 |
| ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $2, %k4, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k0, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k7, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k7, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq %k4, %k7 |
| ; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill |
| ; AVX512BW-NEXT: kshiftrq $3, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $4, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $5, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $6, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k5, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $7, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $8, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq %k5, %k4 |
| ; AVX512BW-NEXT: kshiftrq $10, %k5, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $11, %k4, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $12, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $13, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k5, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $14, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $15, %k4, %k1 |
| ; AVX512BW-NEXT: kmovq %k4, %k3 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} |
| ; AVX512BW-NEXT: kmovq %k3, %k2 |
| ; AVX512BW-NEXT: kshiftrq $16, %k3, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $17, %k2, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k7, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $18, %k4, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $19, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $20, %k4, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $21, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $22, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k5, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $23, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $24, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $25, %k4, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $26, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $27, %k4, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $28, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $29, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k6} {z} |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $30, %k4, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $31, %k4, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k5, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k4, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $33, %k4, %k0 |
| ; AVX512BW-NEXT: kmovq %k4, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 |
| ; AVX512BW-NEXT: kmovq %k7, %k3 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq %k3, %k7 |
| ; AVX512BW-NEXT: kshiftrq $35, %k3, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $36, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $37, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $38, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $39, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $40, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq %k5, %k7 |
| ; AVX512BW-NEXT: kshiftrq $42, %k5, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $43, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $44, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $45, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $46, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $47, %k7, %k1 |
| ; AVX512BW-NEXT: kmovq %k7, %k4 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $48, %k4, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $49, %k4, %k0 |
| ; AVX512BW-NEXT: kmovq %k4, %k7 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $50, %k7, %k1 |
| ; AVX512BW-NEXT: kmovq %k7, %k3 |
| ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z} |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq %k3, %k7 |
| ; AVX512BW-NEXT: kshiftrq $51, %k3, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $52, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $53, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $54, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $55, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $56, %k7, %k0 |
| ; AVX512BW-NEXT: kmovq %k7, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrq $57, %k2, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k0, %k7 |
| ; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k7} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kandw %k4, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq %k2, %k7 |
| ; AVX512BW-NEXT: kshiftrq $58, %k2, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $59, %k7, %k6 |
| ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k6, %k1 |
| ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload |
| ; AVX512BW-NEXT: kshiftrq $60, %k7, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kandw %k3, %k0, %k6 |
| ; AVX512BW-NEXT: kshiftrq $61, %k7, %k0 |
| ; AVX512BW-NEXT: kmovq %k7, %k2 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kandw %k4, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 |
| ; AVX512BW-NEXT: korw %k7, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 |
| ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 |
| ; AVX512BW-NEXT: korw %k1, %k6, %k6 |
| ; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k6} {z} |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k1, %k0, %k0 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftrq $62, %k2, %k0 |
| ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 |
| ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k0 |
| ; AVX512BW-NEXT: kshiftrq $63, %k2, %k2 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k1, %k0, %k1 |
| ; AVX512BW-NEXT: kshiftlw $15, %k2, %k0 |
| ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 |
| ; AVX512BW-NEXT: korw %k6, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 |
| ; AVX512BW-NEXT: korw %k5, %k1, %k1 |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $2, %k0, %k4 |
| ; AVX512BW-NEXT: korw %k4, %k1, %k1 |
| ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload |
| ; AVX512BW-NEXT: kandw %k3, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 |
| ; AVX512BW-NEXT: korw %k2, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 |
| ; AVX512BW-NEXT: korw %k0, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm24, 1536(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 |
| %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 
35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63> |
| %data = call <448 x i32> @llvm.masked.load.v448i32.p0(ptr %in.vec, i32 64, <448 x i1> %tgt.mask, <448 x i32> poison) |
| store <448 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| ; Replication factor 8, 2 mask elements: each of the first 2 bits of the
| ; loaded <64 x i1> mask is broadcast 8 times to form a <16 x i1> mask, which
| ; then guards a masked load of <16 x i32> whose result is stored to %out.vec.
| define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
| ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf2:
| ; AVX512F-ONLY: # %bb.0:
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
| ; AVX512F-ONLY-NEXT: vzeroupper
| ; AVX512F-ONLY-NEXT: retq
| ;
| ; AVX512DQ-LABEL: mask_replication_factor8_vf2:
| ; AVX512DQ: # %bb.0:
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
| ; AVX512DQ-NEXT: vzeroupper
| ; AVX512DQ-NEXT: retq
| ;
| ; AVX512BW-LABEL: mask_replication_factor8_vf2:
| ; AVX512BW: # %bb.0:
| ; AVX512BW-NEXT: kmovq (%rdi), %k1
| ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
| ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
| ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
| ; AVX512BW-NEXT: vzeroupper
| ; AVX512BW-NEXT: retq
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
| %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
| %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
| store <16 x i32> %data, ptr %out.vec, align 64
| ret void
| }
| |
| ; Replication factor 8, 4 mask elements: the first 4 bits of the loaded
| ; <64 x i1> mask are each broadcast 8 times into a <32 x i1> mask, guarding a
| ; masked load of <32 x i32> (two 512-bit vectors) that is stored to %out.vec.
| define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
| ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf4:
| ; AVX512F-ONLY: # %bb.0:
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
| ; AVX512F-ONLY-NEXT: vzeroupper
| ; AVX512F-ONLY-NEXT: retq
| ;
| ; AVX512DQ-LABEL: mask_replication_factor8_vf4:
| ; AVX512DQ: # %bb.0:
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
| ; AVX512DQ-NEXT: vzeroupper
| ; AVX512DQ-NEXT: retq
| ;
| ; AVX512BW-LABEL: mask_replication_factor8_vf4:
| ; AVX512BW: # %bb.0:
| ; AVX512BW-NEXT: kmovq (%rdi), %k0
| ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
| ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
| ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
| ; AVX512BW-NEXT: vzeroupper
| ; AVX512BW-NEXT: retq
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
| %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
| %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
| store <32 x i32> %data, ptr %out.vec, align 64
| ret void
| }
| |
| ; Replication factor 8, 8 mask elements: the first 8 bits of the loaded
| ; <64 x i1> mask are each broadcast 8 times into a <64 x i1> mask, guarding a
| ; masked load of <64 x i32> (four 512-bit vectors) that is stored to %out.vec.
| ; This variant also distinguishes AVX512BW-ONLY (vpshufb-based byte broadcast)
| ; from AVX512VBMI-ONLY (single vpermb) codegen.
| define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
| ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf8:
| ; AVX512F-ONLY: # %bb.0:
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
| ; AVX512F-ONLY-NEXT: vzeroupper
| ; AVX512F-ONLY-NEXT: retq
| ;
| ; AVX512DQ-LABEL: mask_replication_factor8_vf8:
| ; AVX512DQ: # %bb.0:
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
| ; AVX512DQ-NEXT: vzeroupper
| ; AVX512DQ-NEXT: retq
| ;
| ; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8:
| ; AVX512BW-ONLY: # %bb.0:
| ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
| ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
| ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
| ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
| ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
| ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
| ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
| ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
| ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
| ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
| ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
| ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
| ; AVX512BW-ONLY-NEXT: vzeroupper
| ; AVX512BW-ONLY-NEXT: retq
| ;
| ; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8:
| ; AVX512VBMI-ONLY: # %bb.0:
| ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
| ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
| ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
| ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
| ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
| ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
| ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
| ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
| ; AVX512VBMI-ONLY-NEXT: vzeroupper
| ; AVX512VBMI-ONLY-NEXT: retq
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
| %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
| %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
| store <64 x i32> %data, ptr %out.vec, align 64
| ret void
| }
| |
| ; Replication factor 8, 16 mask elements: the first 16 bits of the loaded
| ; <64 x i1> mask are each broadcast 8 times into a <128 x i1> mask, guarding a
| ; masked load of <128 x i32> (eight 512-bit vectors) stored to %out.vec. The
| ; AVX512F/DQ paths need all eight k-registers, so one mask is spilled/reloaded
| ; through the stack.
| define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
| ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf16:
| ; AVX512F-ONLY: # %bb.0:
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
| ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
| ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
| ; AVX512F-ONLY-NEXT: vzeroupper
| ; AVX512F-ONLY-NEXT: retq
| ;
| ; AVX512DQ-LABEL: mask_replication_factor8_vf16:
| ; AVX512DQ: # %bb.0:
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
| ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
| ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
| ; AVX512DQ-NEXT: vzeroupper
| ; AVX512DQ-NEXT: retq
| ;
| ; AVX512BW-LABEL: mask_replication_factor8_vf16:
| ; AVX512BW: # %bb.0:
| ; AVX512BW-NEXT: kmovw (%rdi), %k0
| ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
| ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
| ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63]
| ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
| ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
| ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
| ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
| ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
| ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
| ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
| ; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx)
| ; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx)
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx)
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx)
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx)
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
| ; AVX512BW-NEXT: vzeroupper
| ; AVX512BW-NEXT: retq
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
| %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
| %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
| store <128 x i32> %data, ptr %out.vec, align 64
| ret void
| }
| |
| define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm14 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm0 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm1, %zmm1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm9, %zmm9 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm11, %zmm11 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm13, %zmm13 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm15, %zmm15 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 960(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 832(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 640(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 576(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 512(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor8_vf32: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm14 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm1, %zmm1 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm9, %zmm9 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm11, %zmm11 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm13, %zmm13 |
| ; AVX512DQ-NEXT: vpermd %zmm16, %zmm15, %zmm15 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm2 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 960(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 832(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 640(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 576(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 512(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor8_vf32: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovd (%rdi), %k0 |
| ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] |
| ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm3 |
| ; AVX512BW-NEXT: vpmovb2m %zmm3, %k1 |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] |
| ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 |
| ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2 |
| ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] |
| ; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 |
| ; AVX512BW-NEXT: vpmovb2m %zmm1, %k3 |
| ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 |
| ; AVX512BW-NEXT: vpmovb2m %zmm0, %k4 |
| ; AVX512BW-NEXT: kshiftrd $16, %k4, %k5 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k4, %k4 |
| ; AVX512BW-NEXT: kshiftrd $16, %k4, %k5 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k3, %k4 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k3, %k3 |
| ; AVX512BW-NEXT: kshiftrd $16, %k3, %k4 |
| ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} |
| ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} |
| ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 |
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 |
| ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} |
| ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} |
| ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} |
| ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm14, 960(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm13, 768(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm12, 832(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm10, 704(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm8, 576(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 |
| %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> |
| %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, 
i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> |
| %data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison) |
| store <256 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| ; Replication factor 8, vector factor 64: each of the 64 input mask bits is |
| ; replicated 8x to form a <512 x i1> predicate, which drives a masked load of |
| ; 512 x i32 from %in.vec; the loaded data is stored contiguously to %out.vec. |
| ; NOTE(review): the CHECK bodies below are autogenerated by |
| ; utils/update_llc_test_checks.py -- regenerate them instead of hand-editing. |
| define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { |
| ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf64: |
| ; AVX512F-ONLY: # %bb.0: |
| ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 |
| ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm14 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm16 |
| ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm17, %zmm4 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm18 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm19 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm20 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm21 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm22 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm23 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm24 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm1 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm25 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm26 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm27 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm28 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm11, %zmm29 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm13, %zmm30 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm15, %zmm31 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm17, %zmm2 |
| ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm3 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm5 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm15 |
| ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm17, %zmm0 |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm15 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm30 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm29 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm28 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm27 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm26 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm25 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm1 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm24 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm23 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm22 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm21 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm20 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm19 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm18 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm16 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm14 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm12 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1792(%rsi), %zmm10 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1856(%rsi), %zmm8 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1920(%rsi), %zmm6 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload |
| ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 |
| ; AVX512F-ONLY-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm31, 1984(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1920(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1856(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1792(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1728(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1664(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1600(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1536(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 1472(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1408(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1344(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1280(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 1216(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 1152(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 1088(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1024(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 960(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 896(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 832(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm28, 768(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm29, 704(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm30, 640(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 512(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 448(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 384(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 320(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 64(%rdx) |
| ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512F-ONLY-NEXT: vzeroupper |
| ; AVX512F-ONLY-NEXT: retq |
| ; |
| ; AVX512DQ-LABEL: mask_replication_factor8_vf64: |
| ; AVX512DQ: # %bb.0: |
| ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 |
| ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 |
| ; AVX512DQ-NEXT: kmovw (%rdi), %k0 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 |
| ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm14 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm16 |
| ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm17, %zmm4 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm18 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm19 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm20 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm21 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm22 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm23 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm24 |
| ; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm1 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm25 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm26 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm27 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm28 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm11, %zmm29 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm13, %zmm30 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm15, %zmm31 |
| ; AVX512DQ-NEXT: vpermd %zmm2, %zmm17, %zmm2 |
| ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm3 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm5 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm15 |
| ; AVX512DQ-NEXT: vpermd %zmm0, %zmm17, %zmm0 |
| ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm15 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm30 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm29 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm28 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm27 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm26 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm25 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm1 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm24 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm23 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm22 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm21 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm20 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm19 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm18 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm16 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm14 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm12 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1792(%rsi), %zmm10 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1856(%rsi), %zmm8 {%k1} {z} |
| ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1920(%rsi), %zmm6 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload |
| ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 |
| ; AVX512DQ-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1984(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1920(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1856(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1792(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1728(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1664(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1600(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1536(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1472(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1408(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1344(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1280(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 1216(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1152(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 1088(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1024(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 960(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 896(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 832(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 768(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm29, 704(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 640(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 512(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 448(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 64(%rdx) |
| ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512DQ-NEXT: vzeroupper |
| ; AVX512DQ-NEXT: retq |
| ; |
| ; AVX512BW-LABEL: mask_replication_factor8_vf64: |
| ; AVX512BW: # %bb.0: |
| ; AVX512BW-NEXT: kmovq (%rdi), %k0 |
| ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 |
| ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] |
| ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm7 |
| ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] |
| ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12 |
| ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] |
| ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm17 |
| ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm14 |
| ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] |
| ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm9 |
| ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm4 |
| ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] |
| ; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm2 |
| ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 |
| ; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} |
| ; AVX512BW-NEXT: vpmovb2m %zmm2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} |
| ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm10 {%k2} {z} |
| ; AVX512BW-NEXT: vpmovb2m %zmm9, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm15 {%k1} {z} |
| ; AVX512BW-NEXT: vpmovb2m %zmm14, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm14 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm16 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k2} {z} |
| ; AVX512BW-NEXT: vpmovb2m %zmm17, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm17 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k1} {z} |
| ; AVX512BW-NEXT: vpmovb2m %zmm12, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k2} {z} |
| ; AVX512BW-NEXT: vpmovb2m %zmm7, %k2 |
| ; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k2} {z} |
| ; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z} |
| ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 |
| ; AVX512BW-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} |
| ; AVX512BW-NEXT: vmovdqa64 %zmm31, 1984(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm30, 1920(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm28, 1856(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm29, 1792(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm7, 1536(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm24, 1472(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1408(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1344(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm12, 1280(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm16, 896(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm15, 832(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm14, 768(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm10, 576(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) |
| ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) |
| ; AVX512BW-NEXT: vzeroupper |
| ; AVX512BW-NEXT: retq |
| %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 |
| %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, 
i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, 
i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63> |
| %data = call <512 x i32> @llvm.masked.load.v512i32.p0(ptr %in.vec, i32 64, <512 x i1> %tgt.mask, <512 x i32> poison) |
| store <512 x i32> %data, ptr %out.vec, align 64 |
| ret void |
| } |
| |
| declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>) |
| declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>) |
| declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>) |
| declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>) |
| declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>) |
| declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>) |
| declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>) |
| declare <20 x i32> @llvm.masked.load.v20i32.p0(ptr, i32, <20 x i1>, <20 x i32>) |
| declare <24 x i32> @llvm.masked.load.v24i32.p0(ptr, i32, <24 x i1>, <24 x i32>) |
| declare <28 x i32> @llvm.masked.load.v28i32.p0(ptr, i32, <28 x i1>, <28 x i32>) |
| declare <32 x i32> @llvm.masked.load.v32i32.p0(ptr, i32, <32 x i1>, <32 x i32>) |
| declare <40 x i32> @llvm.masked.load.v40i32.p0(ptr, i32, <40 x i1>, <40 x i32>) |
| declare <48 x i32> @llvm.masked.load.v48i32.p0(ptr, i32, <48 x i1>, <48 x i32>) |
| declare <56 x i32> @llvm.masked.load.v56i32.p0(ptr, i32, <56 x i1>, <56 x i32>) |
| declare <64 x i32> @llvm.masked.load.v64i32.p0(ptr, i32, <64 x i1>, <64 x i32>) |
| declare <80 x i32> @llvm.masked.load.v80i32.p0(ptr, i32, <80 x i1>, <80 x i32>) |
| declare <96 x i32> @llvm.masked.load.v96i32.p0(ptr, i32, <96 x i1>, <96 x i32>) |
| declare <112 x i32> @llvm.masked.load.v112i32.p0(ptr, i32, <112 x i1>, <112 x i32>) |
| declare <128 x i32> @llvm.masked.load.v128i32.p0(ptr, i32, <128 x i1>, <128 x i32>) |
| declare <160 x i32> @llvm.masked.load.v160i32.p0(ptr, i32, <160 x i1>, <160 x i32>) |
| declare <192 x i32> @llvm.masked.load.v192i32.p0(ptr, i32, <192 x i1>, <192 x i32>) |
| declare <224 x i32> @llvm.masked.load.v224i32.p0(ptr, i32, <224 x i1>, <224 x i32>) |
| declare <256 x i32> @llvm.masked.load.v256i32.p0(ptr, i32, <256 x i1>, <256 x i32>) |
| declare <320 x i32> @llvm.masked.load.v320i32.p0(ptr, i32, <320 x i1>, <320 x i32>) |
| declare <384 x i32> @llvm.masked.load.v384i32.p0(ptr, i32, <384 x i1>, <384 x i32>) |
| declare <448 x i32> @llvm.masked.load.v448i32.p0(ptr, i32, <448 x i1>, <448 x i32>) |
| declare <512 x i32> @llvm.masked.load.v512i32.p0(ptr, i32, <512 x i1>, <512 x i32>) |
| ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: |
| ; AVX512: {{.*}} |
| ; FALLBACK0: {{.*}} |
| ; FALLBACK1: {{.*}} |
| ; FALLBACK2: {{.*}} |
| ; FALLBACK3: {{.*}} |
| ; FALLBACK4: {{.*}} |
| ; FALLBACK5: {{.*}} |
| ; FALLBACK6: {{.*}} |
| ; FALLBACK7: {{.*}} |