[X86][SSE] Clean up shuffle combining test check prefixes

Share check prefixes whenever possible, and use X86 instead of X32 for the 32-bit RUN lines (the X32 name is easily confused with the x86-64 x32/ILP32 ABI).
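
With --check-prefixes, each RUN line now enables several prefixes at once (e.g. CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1), so codegen that is identical across triples or ISA levels is checked by a single shared block instead of near-duplicate X32/X64 blocks. Where 32-bit and 64-bit output differ only in the return instruction, a FileCheck regex keeps the block shared, e.g.:

  ; CHECK-NEXT:    ret{{[l|q]}}

which matches both retl (i686) and retq (x86_64). Checks that genuinely differ, such as the pointer-argument loads in the *_load tests, keep their target-specific X86/X64 prefixes.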

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350722 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 03fe8c4..5651174 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX2,X86-AVX2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX512,X86-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX2,X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX512,X64-AVX512
 ;
 ; Combine tests involving AVX target shuffles
 
@@ -23,37 +23,28 @@
 declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8)
 
 define <4 x float> @combine_vpermilvar_4f32_identity(<4 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_4f32_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_4f32_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_4f32_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
   %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
   ret <4 x float> %2
 }
 
 define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_4f32_movddup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_4f32_movddup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_4f32_movddup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
   ret <4 x float> %1
 }
 define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) {
-; X32-LABEL: combine_vpermilvar_4f32_movddup_load:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermilvar_4f32_movddup_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermilvar_4f32_movddup_load:
 ; X64:       # %bb.0:
@@ -65,119 +56,75 @@
 }
 
 define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_4f32_movshdup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_4f32_movshdup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_4f32_movshdup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 undef, i32 1, i32 3, i32 3>)
   ret <4 x float> %1
 }
 
 define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_4f32_movsldup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_4f32_movsldup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_4f32_movsldup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 undef>)
   ret <4 x float> %1
 }
 
 define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_4f32_unpckh:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_4f32_unpckh:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_4f32_unpckh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>)
   ret <4 x float> %1
 }
 
 define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_4f32_unpckl:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_4f32_unpckl:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_4f32_unpckl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>)
   ret <4 x float> %1
 }
 
 define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_8f32_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_8f32_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_8f32_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 undef>)
   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
   ret <8 x float> %2
 }
 
 define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_8f32_10326u4u:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_8f32_10326u4u:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_8f32_10326u4u:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>)
   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>)
   ret <8 x float> %2
 }
 
 define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
-; X32-AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32:
-; X32-AVX1:       # %bb.0:
-; X32-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X32-AVX1-NEXT:    retl
+; AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    ret{{[l|q]}}
 ;
-; X32-AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32:
-; X32-AVX2:       # %bb.0:
-; X32-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X32-AVX2-NEXT:    retl
+; AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    ret{{[l|q]}}
 ;
-; X32-AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32:
-; X32-AVX512:       # %bb.0:
-; X32-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X32-AVX512-NEXT:    retl
-;
-; X64-AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X64-AVX512-NEXT:    retq
+; AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-NEXT:    ret{{[l|q]}}
   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
   %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
@@ -185,15 +132,10 @@
 }
 
 define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
-; X32:       # %bb.0:
-; X32-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
-; X64:       # %bb.0:
-; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
   %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
   %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
@@ -201,19 +143,12 @@
 }
 
 define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) {
-; X32-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; X32-NEXT:    vmovapd %xmm0, %xmm0
-; X32-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; X64-NEXT:    vmovapd %xmm0, %xmm0
-; X64-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; CHECK-NEXT:    vmovapd %xmm0, %xmm0
+; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
   %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
@@ -221,24 +156,19 @@
 }
 
 define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_8f32_movddup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_8f32_movddup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_8f32_movddup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
   ret <8 x float> %1
 }
 define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {
-; X32-LABEL: combine_vpermilvar_8f32_movddup_load:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermilvar_8f32_movddup_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermilvar_8f32_movddup_load:
 ; X64:       # %bb.0:
@@ -250,97 +180,64 @@
 }
 
 define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_8f32_movshdup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_8f32_movshdup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_8f32_movshdup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>)
   ret <8 x float> %1
 }
 
 define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_8f32_movsldup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_8f32_movsldup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_8f32_movsldup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>)
   ret <8 x float> %1
 }
 
 define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
-; X32-LABEL: combine_vpermilvar_2f64_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_2f64_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_2f64_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 2, i64 0>)
   %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>  %1, <2 x i64> <i64 2, i64 0>)
   ret <2 x double> %2
 }
 
 define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) {
-; X32-LABEL: combine_vpermilvar_2f64_movddup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_2f64_movddup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_2f64_movddup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 0, i64 0>)
   ret <2 x double> %1
 }
 
 define <4 x double> @combine_vpermilvar_4f64_identity(<4 x double> %a0) {
-; X32-LABEL: combine_vpermilvar_4f64_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_4f64_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_4f64_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
   %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>  %1, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
   ret <4 x double> %2
 }
 
 define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
-; X32-LABEL: combine_vpermilvar_4f64_movddup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_4f64_movddup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_4f64_movddup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 0, i64 0, i64 4, i64 4>)
   ret <4 x double> %1
 }
 
 define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_4f32_4stage:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_4f32_4stage:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_4f32_4stage:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
   %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>)
   %3 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>  %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>)
@@ -349,15 +246,10 @@
 }
 
 define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_8f32_4stage:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_8f32_4stage:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_8f32_4stage:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
   %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>)
   %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>  %2, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 0, i32 2, i32 1, i32 3>)
@@ -366,119 +258,94 @@
 }
 
 define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_4f32_as_insertps:
-; X32:       # %bb.0:
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_4f32_as_insertps:
-; X64:       # %bb.0:
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_4f32_as_insertps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
   %2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>
   ret <4 x float> %2
 }
 
 define <2 x double> @constant_fold_vpermilvar_pd() {
-; X32-LABEL: constant_fold_vpermilvar_pd:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: constant_fold_vpermilvar_pd:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0]
-; X64-NEXT:    retq
+; CHECK-LABEL: constant_fold_vpermilvar_pd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> <double 1.0, double 2.0>, <2 x i64> <i64 2, i64 0>)
   ret <2 x double> %1
 }
 
 define <4 x double> @constant_fold_vpermilvar_pd_256() {
-; X32-LABEL: constant_fold_vpermilvar_pd_256:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: constant_fold_vpermilvar_pd_256:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0]
-; X64-NEXT:    retq
+; CHECK-LABEL: constant_fold_vpermilvar_pd_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
   ret <4 x double> %1
 }
 
 define <4 x float> @constant_fold_vpermilvar_ps() {
-; X32-LABEL: constant_fold_vpermilvar_ps:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: constant_fold_vpermilvar_ps:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0]
-; X64-NEXT:    retq
+; CHECK-LABEL: constant_fold_vpermilvar_ps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
   ret <4 x float> %1
 }
 
 define <8 x float> @constant_fold_vpermilvar_ps_256() {
-; X32-LABEL: constant_fold_vpermilvar_ps_256:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: constant_fold_vpermilvar_ps_256:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0]
-; X64-NEXT:    retq
+; CHECK-LABEL: constant_fold_vpermilvar_ps_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
   ret <8 x float> %1
 }
 
 define void @PR39483() {
-; X32-AVX1-LABEL: PR39483:
-; X32-AVX1:       # %bb.0: # %entry
-; X32-AVX1-NEXT:    vmovups 32, %ymm0
-; X32-AVX1-NEXT:    vmovups 64, %xmm1
-; X32-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,3]
-; X32-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; X32-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; X32-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; X32-AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; X32-AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; X32-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; X32-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; X32-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; X32-AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; X32-AVX1-NEXT:    vmovups %ymm0, (%eax)
+; X86-AVX1-LABEL: PR39483:
+; X86-AVX1:       # %bb.0: # %entry
+; X86-AVX1-NEXT:    vmovups 32, %ymm0
+; X86-AVX1-NEXT:    vmovups 64, %xmm1
+; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,3]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X86-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; X86-AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, (%eax)
 ;
-; X32-AVX2-LABEL: PR39483:
-; X32-AVX2:       # %bb.0: # %entry
-; X32-AVX2-NEXT:    vmovups 32, %ymm0
-; X32-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; X32-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
-; X32-AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; X32-AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7]
-; X32-AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; X32-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; X32-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vmovups %ymm0, (%eax)
+; X86-AVX2-LABEL: PR39483:
+; X86-AVX2:       # %bb.0: # %entry
+; X86-AVX2-NEXT:    vmovups 32, %ymm0
+; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X86-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
+; X86-AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7]
+; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovups %ymm0, (%eax)
 ;
-; X32-AVX512-LABEL: PR39483:
-; X32-AVX512:       # %bb.0: # %entry
-; X32-AVX512-NEXT:    vmovups 0, %zmm0
-; X32-AVX512-NEXT:    vmovups 64, %ymm1
-; X32-AVX512-NEXT:    vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u>
-; X32-AVX512-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2
-; X32-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm1
-; X32-AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; X32-AVX512-NEXT:    vmovups %ymm0, (%eax)
+; X86-AVX512-LABEL: PR39483:
+; X86-AVX512:       # %bb.0: # %entry
+; X86-AVX512-NEXT:    vmovups 0, %zmm0
+; X86-AVX512-NEXT:    vmovups 64, %ymm1
+; X86-AVX512-NEXT:    vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u>
+; X86-AVX512-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2
+; X86-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm1
+; X86-AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; X86-AVX512-NEXT:    vmovups %ymm0, (%eax)
 ;
 ; X64-AVX1-LABEL: PR39483:
 ; X64-AVX1:       # %bb.0: # %entry
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 90fa533..729863c 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64
 
 declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
@@ -26,28 +26,24 @@
 declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
-; X32-LABEL: combine_permvar_8f64_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_permvar_8f64_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_permvar_8f64_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
   %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
   ret <8 x double> %2
 }
 define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
-; X32-LABEL: combine_permvar_8f64_identity_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
-; X32-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X32-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
-; X32-NEXT:    vmovapd %zmm1, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_permvar_8f64_identity_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; X86-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
+; X86-NEXT:    vmovapd %zmm1, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_permvar_8f64_identity_mask:
 ; X64:       # %bb.0:
@@ -68,28 +64,24 @@
 }
 
 define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
-; X32-LABEL: combine_permvar_8i64_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_permvar_8i64_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_permvar_8i64_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
   %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
   ret <8 x i64> %2
 }
 define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
-; X32-LABEL: combine_permvar_8i64_identity_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X32-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
-; X32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_permvar_8i64_identity_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_permvar_8i64_identity_mask:
 ; X64:       # %bb.0:
@@ -110,27 +102,23 @@
 }
 
 define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
-; X32-LABEL: combine_vpermt2var_8f64_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_8f64_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_8f64_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 -1)
   %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 -1)
   ret <8 x double> %res1
 }
 define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
-; X32-LABEL: combine_vpermt2var_8f64_identity_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X32-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X32-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_8f64_identity_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
+; X86-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X64:       # %bb.0:
@@ -146,24 +134,19 @@
 }
 
 define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {
-; X32-LABEL: combine_vpermt2var_8f64_movddup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_8f64_movddup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_8f64_movddup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
   ret <8 x double> %res0
 }
 define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
-; X32-LABEL: combine_vpermt2var_8f64_movddup_load:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_8f64_movddup_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_8f64_movddup_load:
 ; X64:       # %bb.0:
@@ -174,12 +157,12 @@
   ret <8 x double> %res0
 }
 define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
-; X32-LABEL: combine_vpermt2var_8f64_movddup_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_8f64_movddup_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_8f64_movddup_mask:
 ; X64:       # %bb.0:
@@ -191,27 +174,23 @@
 }
 
 define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
-; X32-LABEL: combine_vpermt2var_8i64_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_8i64_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_8i64_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
   ret <8 x i64> %res1
 }
 define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
-; X32-LABEL: combine_vpermt2var_8i64_identity_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X32-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_8i64_identity_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X64:       # %bb.0:
@@ -227,26 +206,22 @@
 }
 
 define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermt2var_16f32_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_16f32_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_16f32_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 -1)
   %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 -1)
   ret <16 x float> %res1
 }
 define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16f32_identity_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X32-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_identity_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
+; X86-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_identity_mask:
 ; X64:       # %bb.0:
@@ -262,28 +237,22 @@
 }
 
 define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovddup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X32-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_16f32_vmovddup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X64-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; CHECK-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovddup_load:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovaps (%eax), %zmm1
-; X32-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X32-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vmovddup_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovaps (%eax), %zmm1
+; X86-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X86-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
 ; X64:       # %bb.0:
@@ -296,12 +265,12 @@
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
 ; X64:       # %bb.0:
@@ -313,14 +282,14 @@
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovaps (%eax), %zmm1
-; X32-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovaps (%eax), %zmm1
+; X86-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
 ; X64:       # %bb.0:
@@ -335,24 +304,19 @@
 }
 
 define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovshdup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_16f32_vmovshdup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
 ; X64:       # %bb.0:
@@ -363,11 +327,11 @@
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
 ; X64:       # %bb.0:
@@ -379,24 +343,19 @@
 }
 
 define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovsldup:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_16f32_vmovsldup:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
 ; X64:       # %bb.0:
@@ -407,11 +366,11 @@
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
 ; X64:       # %bb.0:
@@ -422,12 +381,12 @@
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
 ; X64:       # %bb.0:
@@ -440,24 +399,19 @@
 }
 
 define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermt2var_16f32_vpermilps:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_16f32_vpermilps:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermt2var_16f32_vpermilps_load:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vpermilps_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vpermilps_load:
 ; X64:       # %bb.0:
@@ -468,11 +422,11 @@
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
 ; X64:       # %bb.0:
@@ -483,12 +437,12 @@
   ret <16 x float> %res0
 }
 define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
 ; X64:       # %bb.0:
@@ -501,26 +455,22 @@
 }
 
 define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
-; X32-LABEL: combine_vpermt2var_16i32_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_16i32_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_16i32_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
   ret <16 x i32> %res1
 }
 define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16i32_identity_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16i32_identity_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i32_identity_mask:
 ; X64:       # %bb.0:
@@ -536,26 +486,22 @@
 }
 
 define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
-; X32-LABEL: combine_vpermt2var_32i16_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_32i16_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_32i16_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
   %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 -1)
   ret <32 x i16> %res1
 }
 define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) {
-; X32-LABEL: combine_vpermt2var_32i16_identity_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
-; X32-NEXT:    vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_32i16_identity_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z}
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; X86-NEXT:    vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
 ; X64:       # %bb.0:
@@ -571,13 +517,9 @@
 }
 
 define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) {
-; X32-LABEL: combine_pshufb_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_pshufb_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_pshufb_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
   %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 undef, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
   %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
@@ -585,16 +527,16 @@
   ret <64 x i8> %res1
 }
 define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
-; X32-LABEL: combine_pshufb_identity_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3
-; X32-NEXT:    vpshufb %zmm2, %zmm0, %zmm3 {%k1}
-; X32-NEXT:    vpshufb %zmm2, %zmm3, %zmm1 {%k1}
-; X32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_pshufb_identity_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3
+; X86-NEXT:    vpshufb %zmm2, %zmm0, %zmm3 {%k1}
+; X86-NEXT:    vpshufb %zmm2, %zmm3, %zmm1 {%k1}
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_identity_mask:
 ; X64:       # %bb.0:
@@ -614,68 +556,48 @@
 }
 
 define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) {
-; X32-LABEL: combine_permvar_as_vpbroadcastw512:
-; X32:       # %bb.0:
-; X32-NEXT:    vpbroadcastw %xmm0, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_permvar_as_vpbroadcastw512:
-; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm0, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_permvar_as_vpbroadcastw512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> zeroinitializer)
   ret <32 x i16> %1
 }
 
 define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
-; X32-LABEL: combine_permvar_as_vpbroadcastd512:
-; X32:       # %bb.0:
-; X32-NEXT:    vbroadcastss %xmm0, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_permvar_as_vpbroadcastd512:
-; X64:       # %bb.0:
-; X64-NEXT:    vbroadcastss %xmm0, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_permvar_as_vpbroadcastd512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer)
   ret <16 x i32> %1
 }
 
 define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
-; X32-LABEL: combine_permvar_as_vpbroadcastq512:
-; X32:       # %bb.0:
-; X32-NEXT:    vbroadcastsd %xmm0, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_permvar_as_vpbroadcastq512:
-; X64:       # %bb.0:
-; X64-NEXT:    vbroadcastsd %xmm0, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_permvar_as_vpbroadcastq512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer)
   ret <8 x i64> %1
 }
 
 define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) {
-; X32-LABEL: combine_permvar_8i64_as_permq:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_permvar_8i64_as_permq:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_permvar_8i64_as_permq:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
   ret <8 x i64> %1
 }
 define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
-; X32-LABEL: combine_permvar_8i64_as_permq_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
-; X32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_permvar_8i64_as_permq_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_permvar_8i64_as_permq_mask:
 ; X64:       # %bb.0:
@@ -690,26 +612,21 @@
 }
 
 define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) {
-; X32-LABEL: combine_permvar_8f64_as_permpd:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_permvar_8f64_as_permpd:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_permvar_8f64_as_permpd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
   ret <8 x double> %1
 }
 define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
-; X32-LABEL: combine_permvar_8f64_as_permpd_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
-; X32-NEXT:    vmovapd %zmm1, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_permvar_8f64_as_permpd_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
+; X86-NEXT:    vmovapd %zmm1, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_permvar_8f64_as_permpd_mask:
 ; X64:       # %bb.0:
@@ -724,39 +641,29 @@
 }
 
 define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) {
-; X32-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1)
   ret <16 x float> %res1
 }
 
 define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
-; X32-LABEL: combine_pshufb_as_pslldq:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_pshufb_as_pslldq:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_pshufb_as_pslldq:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> undef, i64 -1)
   ret <64 x i8> %res0
 }
 define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) {
-; X32-LABEL: combine_pshufb_as_pslldq_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
-; X32-NEXT:    retl
+; X86-LABEL: combine_pshufb_as_pslldq_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_pslldq_mask:
 ; X64:       # %bb.0:
@@ -768,24 +675,19 @@
 }
 
 define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) {
-; X32-LABEL: combine_pshufb_as_psrldq:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_pshufb_as_psrldq:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_pshufb_as_psrldq:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> undef, i64 -1)
   ret <64 x i8> %res0
 }
 define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
-; X32-LABEL: combine_pshufb_as_psrldq_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT:    retl
+; X86-LABEL: combine_pshufb_as_psrldq_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_psrldq_mask:
 ; X64:       # %bb.0:
@@ -797,47 +699,30 @@
 }
 
 define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
-; X32-LABEL: combine_permvar_as_pshuflw:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_permvar_as_pshuflw:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_permvar_as_pshuflw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>)
   ret <32 x i16> %1
 }
 
 define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
-; X32-LABEL: combine_pshufb_as_pshufhw:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_pshufb_as_pshufhw:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_pshufb_as_pshufhw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>)
   ret <32 x i16> %1
 }
 
 define <64 x i8> @combine_pshufb_as_packsswb(<32 x i16> %a0, <32 x i16> %a1) nounwind {
-; X32-LABEL: combine_pshufb_as_packsswb:
-; X32:       # %bb.0:
-; X32-NEXT:    vpsraw $11, %zmm0, %zmm0
-; X32-NEXT:    vpsraw $11, %zmm1, %zmm1
-; X32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_pshufb_as_packsswb:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsraw $11, %zmm0, %zmm0
-; X64-NEXT:    vpsraw $11, %zmm1, %zmm1
-; X64-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_pshufb_as_packsswb:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsraw $11, %zmm0, %zmm0
+; CHECK-NEXT:    vpsraw $11, %zmm1, %zmm1
+; CHECK-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = ashr <32 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %2 = ashr <32 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %3 = bitcast <32 x i16> %1 to <64 x i8>
@@ -849,19 +734,12 @@
 }
 
 define <64 x i8> @combine_pshufb_as_packuswb(<32 x i16> %a0, <32 x i16> %a1) nounwind {
-; X32-LABEL: combine_pshufb_as_packuswb:
-; X32:       # %bb.0:
-; X32-NEXT:    vpsrlw $11, %zmm0, %zmm0
-; X32-NEXT:    vpsrlw $11, %zmm1, %zmm1
-; X32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_pshufb_as_packuswb:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsrlw $11, %zmm0, %zmm0
-; X64-NEXT:    vpsrlw $11, %zmm1, %zmm1
-; X64-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_pshufb_as_packuswb:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsrlw $11, %zmm0, %zmm0
+; CHECK-NEXT:    vpsrlw $11, %zmm1, %zmm1
+; CHECK-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = lshr <32 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %2 = lshr <32 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %3 = bitcast <32 x i16> %1 to <64 x i8>
@@ -873,133 +751,93 @@
 }
 
 define <32 x i16> @combine_vpermi2var_32i16_as_pshufb(<32 x i16> %a0) {
-; X32-LABEL: combine_vpermi2var_32i16_as_pshufb:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_32i16_as_pshufb:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_32i16_as_pshufb:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>)
   %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %1, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>)
   ret <32 x i16> %2
 }
 
 define <8 x double> @combine_vpermi2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
-; X32-LABEL: combine_vpermi2var_8f64_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_8f64_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_8f64_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 -1)
   ret <8 x double> %res1
 }
 
 define <8 x double> @combine_vpermi2var_8f64_as_shufpd(<8 x double> %x0, <8 x double> %x1) {
-; X32-LABEL: combine_vpermi2var_8f64_as_shufpd:
-; X32:       # %bb.0:
-; X32-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_8f64_as_shufpd:
-; X64:       # %bb.0:
-; X64-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_8f64_as_shufpd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 1, i64 8, i64 2, i64 10, i64 5, i64 13, i64 6, i64 15>, <8 x double> %x1, i8 -1)
   ret <8 x double> %1
 }
 
 define <8 x i64> @combine_vpermi2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
-; X32-LABEL: combine_vpermi2var_8i64_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_8i64_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_8i64_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %res0, <8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1)
   ret <8 x i64> %res1
 }
 
 define <16 x float> @combine_vpermi2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermi2var_16f32_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_16f32_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_16f32_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x1, i16 -1)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, i16 -1)
   ret <16 x float> %res1
 }
 
 define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
-; X32-LABEL: combine_vpermi2var_16i32_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_16i32_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_16i32_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x1, i16 -1)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, i16 -1)
   ret <16 x i32> %res1
 }
 
 define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) {
-; X32-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
-; X32:       # %bb.0:
-; X32-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
-; X64:       # %bb.0:
-; X64-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> <i32 18, i32 2, i32 19, i32 3, i32 22, i32 6, i32 23, i32 7, i32 26, i32 10, i32 27, i32 11, i32 30, i32 14, i32 31, i32 15>, <16 x float> %a1, i16 -1)
   ret <16 x float> %res0
 }
 
 define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) {
-; X32-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
-; X32:       # %bb.0:
-; X32-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; X32-NEXT:    retl
-;
-; X64-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
-; X64:       # %bb.0:
-; X64-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; X64-NEXT:    retq
+; CHECK-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>, <16 x i32> %a1, i16 -1)
   ret <16 x i32> %res0
 }
 
 define <32 x i16> @combine_vpermi2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
-; X32-LABEL: combine_vpermi2var_32i16_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_32i16_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_32i16_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x1, i32 -1)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, i32 -1)
   ret <32 x i16> %res1
 }
 
 define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) {
-; X32-LABEL: combine_vpermi2var_8f64_as_vpermpd:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
-; X32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermi2var_8f64_as_vpermpd:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X86-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd:
 ; X64:       # %bb.0:
@@ -1012,11 +850,11 @@
 }
 
 define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) {
-; X32-LABEL: combine_vpermt2var_8i64_as_vpermq:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
-; X32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_8i64_as_vpermq:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X86-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_as_vpermq:
 ; X64:       # %bb.0:
@@ -1029,63 +867,45 @@
 }
 
 define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) {
-; X32-LABEL: combine_vpermi2var_16f32_as_vpermps:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
-; X32-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_16f32_as_vpermps:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
-; X64-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_16f32_as_vpermps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x float> %x1, i16 -1)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x float> %res0, i16 -1)
   ret <16 x float> %res1
 }
 
 define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
-; X32-LABEL: combine_vpermt2var_16i32_as_vpermd:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
-; X32-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_16i32_as_vpermd:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
-; X64-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_16i32_as_vpermd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
   ret <16 x i32> %res1
 }
 
 define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
-; X32-LABEL: combine_vpermi2var_32i16_as_permw:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
-; X32-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_32i16_as_permw:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
-; X64-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_32i16_as_permw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16>, <32 x i16> %x1, i32 -1)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 0, i16 31, i16 1, i16 30, i16 2, i16 29, i16 3, i16 28, i16 4, i16 27, i16 5, i16 26, i16 6, i16 25, i16 7, i16 24, i16 8, i16 23, i16 9, i16 22, i16 10, i16 21, i16 11, i16 20, i16 12, i16 19, i16 13, i16 18, i16 14, i16 17, i16 15, i16 16>, <32 x i16> %res0, i32 -1)
   ret <32 x i16> %res1
 }
 
 define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) {
-; X32-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
-; X32-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
-; X32-NEXT:    vmovapd %zmm2, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
+; X86-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
+; X86-NEXT:    vmovapd %zmm2, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
 ; X64:       # %bb.0:
@@ -1099,48 +919,35 @@
 }
 
 define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
-; X32-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
-; X32-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
-; X64-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; CHECK-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 0, i32 31, i32 2, i32 29, i32 4, i32 27, i32 6, i32 25, i32 8, i32 23, i32 10, i32 21, i32 12, i32 19, i32 14, i32 17>, <16 x i32> %x1, i16 -1)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 17, i32 2, i32 18, i32 4, i32 19, i32 6, i32 21, i32 8, i32 23, i32 10, i32 25, i32 12, i32 27, i32 14, i32 29>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
   ret <16 x i32> %res1
 }
 
 define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
-; X32-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
-; X32-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
-; X32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
-; X64-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
-; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
+; CHECK-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 0, i16 63, i16 1, i16 61, i16 2, i16 59, i16 3, i16 57, i16 4, i16 55, i16 5, i16 53, i16 6, i16 51, i16 7, i16 49, i16 8, i16 47, i16 9, i16 45, i16 10, i16 43, i16 11, i16 41, i16 12, i16 39, i16 13, i16 37, i16 14, i16 35, i16 15, i16 33>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16>, <32 x i16> %res0, i32 -1)
   ret <32 x i16> %res1
 }
 
 define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) {
-; X32-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
-; X32:       # %bb.0:
-; X32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0]
-; X32-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
+; X86:       # %bb.0:
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0]
+; X86-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
 ; X64:       # %bb.0:
@@ -1154,29 +961,22 @@
 }
 
 define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x float> %x0) {
-; X32-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
-; X32:       # %bb.0:
-; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
-; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
-; X64:       # %bb.0:
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
-; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
+; CHECK-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 14, i32 2, i32 12, i32 4, i32 10, i32 3, i32 12, i32 4, i32 11, i32 5, i32 10, i32 6, i32 9, i32 7, i32 8>, <16 x float> %res0, i16 -1)
   ret <16 x float> %res1
 }
 
 define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
-; X32-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
-; X32:       # %bb.0:
-; X32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
+; X86:       # %bb.0:
+; X86-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
 ; X64:       # %bb.0:
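
For readers unfamiliar with FileCheck's multi-prefix mode, this is how the sharing above works: every name passed via --check-prefixes is active for that RUN line, so assertions that are identical across all RUN configurations collapse into a single CHECK block, triple-specific output splits into X86/X64 blocks, and the lone retl/retq difference is absorbed by the ret{{[l|q]}} regex. A minimal sketch, using a hypothetical test that is not part of this commit:

; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64

define <4 x float> @shared_prefix_example(<4 x float> %a0) {
; CHECK-LABEL: shared_prefix_example:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %a0, %a0
  ret <4 x float> %1
}

Both RUN lines then verify the same CHECK block. The functions above that keep separate X86/X64 bodies do so because their output genuinely differs between triples, e.g. the <8 x i64> shuffle index constants print as sixteen i32 halves ([7,0,6,0,5,0,...]) on the 32-bit target.
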
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index c4ab922..fd41c9f 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -1,31 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
 
 declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
 declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
 
 define <16 x i16> @combine_vpermt2var_16i16_identity(<16 x i16> %x0, <16 x i16> %x1) {
-; X32-LABEL: combine_vpermt2var_16i16_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_16i16_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_16i16_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x0, <16 x i16> %x1, i16 -1)
   %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 30, i16 13, i16 28, i16 11, i16 26, i16 9, i16 24, i16 7, i16 22, i16 5, i16 20, i16 3, i16 18, i16 1, i16 16>, <16 x i16> %res0, <16 x i16> %res0, i16 -1)
   ret <16 x i16> %res1
 }
 define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x i16> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16i16_identity_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2w %ymm0, %ymm0, %ymm1 {%k1} {z}
-; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT:    vpermi2w %ymm1, %ymm1, %ymm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16i16_identity_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermi2w %ymm0, %ymm0, %ymm1 {%k1} {z}
+; X86-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT:    vpermi2w %ymm1, %ymm1, %ymm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
 ; X64:       # %bb.0:
@@ -41,63 +37,41 @@
 }
 
 define <16 x i16> @combine_vpermi2var_16i16_as_permw(<16 x i16> %x0, <16 x i16> %x1) {
-; X32-LABEL: combine_vpermi2var_16i16_as_permw:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
-; X32-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_16i16_as_permw:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
-; X64-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_16i16_as_permw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x1, i16 -1)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %res0, <16 x i16> <i16 0, i16 15, i16 1, i16 14, i16 2, i16 13, i16 3, i16 12, i16 4, i16 11, i16 5, i16 10, i16 6, i16 9, i16 7, i16 8>, <16 x i16> %res0, i16 -1)
   ret <16 x i16> %res1
 }
 
 define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_vperm2(<16 x i16> %x0, <16 x i16> %x1) {
-; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
-; X32-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
-; X64-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; CHECK-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> <i16 0, i16 31, i16 2, i16 29, i16 4, i16 27, i16 6, i16 25, i16 8, i16 23, i16 10, i16 21, i16 12, i16 19, i16 14, i16 17>, <16 x i16> %x1, i16 -1)
   %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 0, i16 17, i16 2, i16 18, i16 4, i16 19, i16 6, i16 21, i16 8, i16 23, i16 10, i16 25, i16 12, i16 27, i16 14, i16 29>, <16 x i16> %res0, <16 x i16> %res0, i16 -1)
   ret <16 x i16> %res1
 }
 
 define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
-; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpckhwd:
-; X32:       # %bb.0:
-; X32-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpckhwd:
-; X64:       # %bb.0:
-; X64-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpckhwd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %a0, <16 x i16> <i16 20, i16 4, i16 21, i16 5, i16 22, i16 6, i16 23, i16 7, i16 28, i16 12, i16 29, i16 13, i16 30, i16 14, i16 31, i16 15>, <16 x i16> %a1, i16 -1)
   ret <16 x i16> %res0
 }
 
 define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
-; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpcklwd:
-; X32:       # %bb.0:
-; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpcklwd:
-; X64:       # %bb.0:
-; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpcklwd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 0, i16 16, i16 1, i16 17, i16 2, i16 18, i16 3, i16 19, i16 8, i16 24, i16 9, i16 25, i16 10, i16 26, i16 11, i16 27>, <16 x i16> %a0, <16 x i16> %a1, i16 -1)
   ret <16 x i16> %res0
 }
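
The masked variants above likewise keep separate X86 and X64 blocks because the i686 calling convention passes the mask argument on the stack, while x86-64 passes it in a general-purpose register, so the load into %k1 differs even when the rest of the sequence matches. A hypothetical pair of check lines for an i16 mask argument, assuming the usual SysV register assignment (%edi holds the first integer argument):

; X86: kmovw {{[0-9]+}}(%esp), %k1
; X64: kmovd %edi, %k1
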
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
index 5350dda..0e0dfec 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512vbmi,+avx512vl | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vbmi,+avx512vl | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512vbmi,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vbmi,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
 
 declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
 declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
@@ -19,26 +19,22 @@
 declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <16 x i8> @combine_vpermt2var_16i8_identity(<16 x i8> %x0, <16 x i8> %x1) {
-; X32-LABEL: combine_vpermt2var_16i8_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_16i8_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_16i8_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x0, <16 x i8> %x1, i16 -1)
   %res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 30, i8 13, i8 28, i8 11, i8 26, i8 9, i8 24, i8 7, i8 22, i8 5, i8 20, i8 3, i8 18, i8 1, i8 16>, <16 x i8> %res0, <16 x i8> %res0, i16 -1)
   ret <16 x i8> %res1
 }
 define <16 x i8> @combine_vpermt2var_16i8_identity_mask(<16 x i8> %x0, <16 x i8> %x1, i16 %m) {
-; X32-LABEL: combine_vpermt2var_16i8_identity_mask:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2b %xmm0, %xmm0, %xmm1 {%k1} {z}
-; X32-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X32-NEXT:    vpermi2b %xmm1, %xmm1, %xmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpermt2var_16i8_identity_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermi2b %xmm0, %xmm0, %xmm1 {%k1} {z}
+; X86-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT:    vpermi2b %xmm1, %xmm1, %xmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
 ; X64:       # %bb.0:
@@ -54,100 +50,63 @@
 }
 
 define <16 x i8> @combine_vpermi2var_16i8_as_vpshufb(<16 x i8> %x0, <16 x i8> %x1) {
-; X32-LABEL: combine_vpermi2var_16i8_as_vpshufb:
-; X32:       # %bb.0:
-; X32-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_16i8_as_vpshufb:
-; X64:       # %bb.0:
-; X64-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_16i8_as_vpshufb:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x1, i16 -1)
   %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 15, i8 1, i8 14, i8 2, i8 13, i8 3, i8 12, i8 4, i8 11, i8 5, i8 10, i8 6, i8 9, i8 7, i8 8>, <16 x i8> %res0, i16 -1)
   ret <16 x i8> %res1
 }
 define <32 x i8> @combine_vpermi2var_32i8_as_vpermb(<32 x i8> %x0, <32 x i8> %x1) {
-; X32-LABEL: combine_vpermi2var_32i8_as_vpermb:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
-; X32-NEXT:    vpermb %ymm0, %ymm1, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_32i8_as_vpermb:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
-; X64-NEXT:    vpermb %ymm0, %ymm1, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_32i8_as_vpermb:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; CHECK-NEXT:    vpermb %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
   %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %res0, <32 x i8> <i8 0, i8 32, i8 2, i8 30, i8 4, i8 28, i8 6, i8 26, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22, i8 0, i8 32, i8 2, i8 30, i8 4, i8 28, i8 6, i8 26, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22>, <32 x i8> %res0, i32 -1)
   ret <32 x i8> %res1
 }
 define <64 x i8> @combine_vpermi2var_64i8_as_vpermb(<64 x i8> %x0, <64 x i8> %x1) {
-; X32-LABEL: combine_vpermi2var_64i8_as_vpermb:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
-; X32-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_64i8_as_vpermb:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
-; X64-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_64i8_as_vpermb:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; CHECK-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %res0, <64 x i8> <i8 0, i8 32, i8 2, i8 30, i8 4, i8 28, i8 6, i8 26, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22, i8 0, i8 32, i8 2, i8 30, i8 4, i8 28, i8 6, i8 26, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22, i8 0, i8 32, i8 2, i8 30, i8 4, i8 28, i8 6, i8 26, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22, i8 0, i8 32, i8 2, i8 30, i8 4, i8 28, i8 6, i8 26, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22>, <64 x i8> %res0, i64 -1)
   ret <64 x i8> %res1
 }
 
 define <16 x i8> @combine_vpermt2var_vpermi2var_16i8_as_vperm2(<16 x i8> %x0, <16 x i8> %x1) {
-; X32-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
-; X32-NEXT:    vpermi2b %xmm1, %xmm0, %xmm2
-; X32-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
-; X32-NEXT:    vpermi2b %xmm2, %xmm2, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
-; X64-NEXT:    vpermi2b %xmm1, %xmm0, %xmm2
-; X64-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
-; X64-NEXT:    vpermi2b %xmm2, %xmm2, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
+; CHECK-NEXT:    vpermi2b %xmm1, %xmm0, %xmm2
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
+; CHECK-NEXT:    vpermi2b %xmm2, %xmm2, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 0, i8 31, i8 2, i8 29, i8 4, i8 27, i8 6, i8 25, i8 8, i8 23, i8 10, i8 21, i8 12, i8 19, i8 14, i8 17>, <16 x i8> %x1, i16 -1)
   %res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 0, i8 17, i8 2, i8 18, i8 4, i8 19, i8 6, i8 21, i8 8, i8 23, i8 10, i8 25, i8 12, i8 27, i8 14, i8 29>, <16 x i8> %res0, <16 x i8> %res0, i16 -1)
   ret <16 x i8> %res1
 }
 define <32 x i8> @combine_vpermi2var_32i8_as_vperm2(<32 x i8> %x0, <32 x i8> %x1) {
-; X32-LABEL: combine_vpermi2var_32i8_as_vperm2:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
-; X32-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_32i8_as_vperm2:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
-; X64-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_32i8_as_vperm2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; CHECK-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
   %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %res0, <32 x i8> <i8 0, i8 32, i8 2, i8 30, i8 4, i8 28, i8 6, i8 26, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22, i8 0, i8 32, i8 2, i8 30, i8 4, i8 28, i8 6, i8 26, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22>, <32 x i8> %x1, i32 -1)
   ret <32 x i8> %res1
 }
 define <64 x i8> @combine_vpermi2var_64i8_as_vperm2(<64 x i8> %x0, <64 x i8> %x1) {
-; X32-LABEL: combine_vpermi2var_64i8_as_vperm2:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
-; X32-NEXT:    vpermt2b %zmm1, %zmm2, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermi2var_64i8_as_vperm2:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
-; X64-NEXT:    vpermt2b %zmm1, %zmm2, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermi2var_64i8_as_vperm2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; CHECK-NEXT:    vpermt2b %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %res0, <64 x i8> <i8 0, i8 80, i8 2, i8 70, i8 4, i8 60, i8 6, i8 50, i8 8, i8 40, i8 10, i8 30, i8 12, i8 20, i8 14, i8 10, i8 0, i8 90, i8 2, i8 100, i8 4, i8 110, i8 6, i8 120, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22, i8 0, i8 32, i8 2, i8 30, i8 4, i8 28, i8 6, i8 26, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22, i8 0, i8 32, i8 2, i8 30, i8 4, i8 28, i8 6, i8 26, i8 8, i8 28, i8 10, i8 26, i8 12, i8 24, i8 14, i8 22>, <64 x i8> %x1, i64 -1)
   ret <64 x i8> %res1
diff --git a/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index 27ccdef..60c3c86 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
-;
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512F
+
 ; Combine tests involving SSE41 target shuffles (BLEND,INSERTPS,MOVZX)
 
 declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
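; NOTE (illustrative sketch, not part of the commit): FileCheck's
; --check-prefixes=A,B,C flag is shorthand for passing several --check-prefix
; flags at once, and a directive fires when any prefix active for that RUN
; line matches. That is what lets update_llc_test_checks.py emit one shared
; CHECK block instead of duplicated per-RUN blocks. Assuming a hypothetical
; test function @f checked under two RUN lines:
;
;   RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86
;   RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64
;
;   CHECK-LABEL: f:
;   CHECK:       vmovaps %xmm1, %xmm0
;   X86-NEXT:    retl
;   X64-NEXT:    retq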
diff --git a/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll b/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
index 1b701f8..7fe6403 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
@@ -1,28 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE42
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,+sse4a| FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2,+sse4a | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+sse4a | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ;
 ; Combine tests involving SSE4A target shuffles (EXTRQI,INSERTQI)
 
 declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
 
 define <16 x i8> @combine_extrqi_pshufb_16i8(<16 x i8> %a0) {
-; ALL-LABEL: combine_extrqi_pshufb_16i8:
-; ALL:       # %bb.0:
-; ALL-NEXT:    extrq {{.*#+}} xmm0 = xmm0[1,2],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_extrqi_pshufb_16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    extrq {{.*#+}} xmm0 = xmm0[1,2],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    retq
   %1 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
   ret <16 x i8> %2
 }
 
 define <8 x i16> @combine_extrqi_pshufb_8i16(<8 x i16> %a0) {
-; ALL-LABEL: combine_extrqi_pshufb_8i16:
-; ALL:       # %bb.0:
-; ALL-NEXT:    extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_extrqi_pshufb_8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
   %2 = bitcast <8 x i16> %1 to <16 x i8>
   %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
@@ -75,10 +75,10 @@
 }
 
 define <16 x i8> @combine_pshufb_insertqi_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
-; ALL-LABEL: combine_pshufb_insertqi_pshufb:
-; ALL:       # %bb.0:
-; ALL-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0,1],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u]
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_pshufb_insertqi_pshufb:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0,1],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    retq
   %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
   %2 = shufflevector <16 x i8> %1, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 17, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 7, i8 1, i8 2, i8 4, i8 3, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index f32f87b..a531bf6 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512F
 ;
 ; Combine tests involving SSE3/SSSE3 target shuffles (MOVDDUP, MOVSHDUP, MOVSLDUP, PSHUFB)
 
@@ -457,9 +457,9 @@
 }
 
 define <8 x i16> @combine_pshufb_as_unpacklo_undef(<16 x i8> %a0) {
-; ALL-LABEL: combine_pshufb_as_unpacklo_undef:
-; ALL:       # %bb.0:
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_pshufb_as_unpacklo_undef:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 2, i8 3, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 6, i8 7>)
   %2 = bitcast <16 x i8> %1 to <8 x i16>
   %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -467,9 +467,9 @@
 }
 
 define <16 x i8> @combine_pshufb_as_unpackhi_undef(<16 x i8> %a0) {
-; ALL-LABEL: combine_pshufb_as_unpackhi_undef:
-; ALL:       # %bb.0:
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_pshufb_as_unpackhi_undef:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 undef, i8 10, i8 undef, i8 11, i8 undef, i8 12, i8 undef, i8 13, i8 undef, i8 14, i8 undef, i8 15, i8 undef>)
   %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
   ret <16 x i8> %2
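; NOTE (illustrative sketch, not part of the commit): prefixes can only be
; shared where the generated asm is identical across every RUN line; where
; codegen diverges, the narrower prefixes remain. E.g. for a hypothetical
; shuffle lowered as pshufb on SSE targets but vpshufb on AVX targets:
;
;   CHECK-LABEL: f:
;   SSE:         pshufb %xmm1, %xmm0
;   AVX:         vpshufb %xmm1, %xmm0, %xmm0
;   CHECK:       retq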
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 37d13d1..7da6afd 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X32 --check-prefix=X86AVX
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+xop | FileCheck %s --check-prefix=X32 --check-prefix=X86AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X64 --check-prefix=X64AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s --check-prefix=X64 --check-prefix=X64AVX2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+xop | FileCheck %s --check-prefixes=CHECK,X86,AVX2,X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s --check-prefixes=CHECK,X64,AVX2,X64-AVX2
 
 declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
 declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
@@ -13,211 +13,138 @@
 declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
 
 define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double> %a1) {
-; X32-LABEL: combine_vpermil2pd_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps %xmm1, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2pd_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps %xmm1, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2pd_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x i64> <i64 2, i64 0>, i8 0)
   %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x i64> <i64 2, i64 0>, i8 0)
   ret <2 x double> %res1
 }
 
 define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x double> %a1) {
-; X32-LABEL: combine_vpermil2pd256_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps %ymm1, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2pd256_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps %ymm1, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2pd256_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
   %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
   ret <4 x double> %res1
 }
 
 define <4 x double> @combine_vpermil2pd256_0z73(<4 x double> %a0, <4 x double> %a1) {
-; X32-LABEL: combine_vpermil2pd256_0z73:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3],ymm0[3]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2pd256_0z73:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3],ymm0[3]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2pd256_0z73:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3],ymm0[3]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 undef, i32 7, i32 3>
   %res1 = shufflevector <4 x double> %res0, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
   ret <4 x double> %res1
 }
 
 define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1) {
-; X32-LABEL: combine_vpermil2ps_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps %xmm1, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2ps_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps %xmm1, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2ps_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
   %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
   ret <4 x float> %res1
 }
 
 define <4 x float> @combine_vpermil2ps_1z74(<4 x float> %a0, <4 x float> %a1) {
-; X32-LABEL: combine_vpermil2ps_1z74:
-; X32:       # %bb.0:
-; X32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
-; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2ps_1z74:
-; X64:       # %bb.0:
-; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2ps_1z74:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 1, i32 1, i32 7, i32 4>, i8 0)
   %res1 = shufflevector <4 x float> %res0, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
   ret <4 x float> %res1
 }
 
 define <4 x float> @combine_vpermil2ps_02zu(<4 x float> %a0, <4 x float> %a1) {
-; X32-LABEL: combine_vpermil2ps_02zu:
-; X32:       # %bb.0:
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2ps_02zu:
-; X64:       # %bb.0:
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2ps_02zu:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 undef>, i8 0)
   ret <4 x float> %res0
 }
 
 define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) {
-; X32-LABEL: combine_vpermil2ps256_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps %ymm1, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2ps256_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps %ymm1, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2ps256_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
   %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
   ret <8 x float> %res1
 }
 
 define <8 x float> @combine_vpermil2ps256_08z945Az(<8 x float> %a0, <8 x float> %a1) {
-; X32-LABEL: combine_vpermil2ps256_08z945Az:
-; X32:       # %bb.0:
-; X32-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0],zero,ymm1[1],ymm0[4,5],ymm1[6],zero
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2ps256_08z945Az:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0],zero,ymm1[1],ymm0[4,5],ymm1[6],zero
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2ps256_08z945Az:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0],zero,ymm1[1],ymm0[4,5],ymm1[6],zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 0, i32 1, i32 6, i32 7>, i8 0)
   %res1 = shufflevector <8 x float> %res0, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 8>
   ret <8 x float> %res1
 }
 
 define <8 x float> @combine_vpermil2ps256_zero(<8 x float> %a0, <8 x float> %a1) {
-; X32-LABEL: combine_vpermil2ps256_zero:
-; X32:       # %bb.0:
-; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2ps256_zero:
-; X64:       # %bb.0:
-; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2ps256_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>, i8 2)
   ret <8 x float> %res0
 }
 
 define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x float> %a1) {
-; X32-LABEL: combine_vpermil2ps_blend_with_zero:
-; X32:       # %bb.0:
-; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2ps_blend_with_zero:
-; X64:       # %bb.0:
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2ps_blend_with_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 8, i32 1, i32 2, i32 3>, i8 2)
   ret <4 x float> %res0
 }
 
 define <2 x double> @combine_vpermil2pd_as_shufpd(<2 x double> %a0, <2 x double> %a1) {
-; X32-LABEL: combine_vpermil2pd_as_shufpd:
-; X32:       # %bb.0:
-; X32-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2pd_as_shufpd:
-; X64:       # %bb.0:
-; X64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2pd_as_shufpd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> <i64 2, i64 4>, i8 0)
   ret <2 x double> %res0
 }
 
 define <4 x double> @combine_vpermil2pd256_as_shufpd(<4 x double> %a0, <4 x double> %a1) {
-; X32-LABEL: combine_vpermil2pd256_as_shufpd:
-; X32:       # %bb.0:
-; X32-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpermil2pd256_as_shufpd:
-; X64:       # %bb.0:
-; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpermil2pd256_as_shufpd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> <i64 0, i64 4, i64 2, i64 7>, i8 0)
   ret <4 x double> %res0
 }
 
 define <16 x i8> @combine_vpperm_identity(<16 x i8> %a0, <16 x i8> %a1) {
-; X32-LABEL: combine_vpperm_identity:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps %xmm1, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpperm_identity:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps %xmm1, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpperm_identity:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
   %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
   ret <16 x i8> %res1
 }
 
 define <16 x i8> @combine_vpperm_zero(<16 x i8> %a0, <16 x i8> %a1) {
-; X32-LABEL: combine_vpperm_zero:
-; X32:       # %bb.0:
-; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpperm_zero:
-; X64:       # %bb.0:
-; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpperm_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
   %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
   %res2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res1, <16 x i8> undef, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
@@ -225,10 +152,10 @@
 }
 
 define <16 x i8> @combine_vpperm_identity_bitcast(<16 x i8> %a0, <16 x i8> %a1) {
-; X32-LABEL: combine_vpperm_identity_bitcast:
-; X32:       # %bb.0:
-; X32-NEXT:    vpaddq {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT:    retl
+; X86-LABEL: combine_vpperm_identity_bitcast:
+; X86:       # %bb.0:
+; X86-NEXT:    vpaddq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpperm_identity_bitcast:
 ; X64:       # %bb.0:
@@ -244,73 +171,47 @@
 }
 
 define <16 x i8> @combine_vpperm_as_blend_with_zero(<16 x i8> %a0, <16 x i8> %a1) {
-; X32-LABEL: combine_vpperm_as_blend_with_zero:
-; X32:       # %bb.0:
-; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6,7]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpperm_as_blend_with_zero:
-; X64:       # %bb.0:
-; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6,7]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpperm_as_blend_with_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 0, i8 1, i8 128, i8 129, i8 4, i8 5, i8 6, i8 7, i8 130, i8 131, i8 132, i8 133, i8 134, i8 135, i8 136, i8 137>)
   ret <16 x i8> %res0
 }
 
 define <16 x i8> @combine_vpperm_as_unary_unpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
-; X32-LABEL: combine_vpperm_as_unary_unpckhbw:
-; X32:       # %bb.0:
-; X32-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpperm_as_unary_unpckhbw:
-; X64:       # %bb.0:
-; X64-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpperm_as_unary_unpckhbw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
   ret <16 x i8> %res0
 }
 
 define <16 x i8> @combine_vpperm_as_unpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
-; X32-LABEL: combine_vpperm_as_unpckhbw:
-; X32:       # %bb.0:
-; X32-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpperm_as_unpckhbw:
-; X64:       # %bb.0:
-; X64-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpperm_as_unpckhbw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 8, i8 24, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
   ret <16 x i8> %res0
 }
 
 define <16 x i8> @combine_vpperm_as_unpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
-; X32-LABEL: combine_vpperm_as_unpcklbw:
-; X32:       # %bb.0:
-; X32-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpperm_as_unpcklbw:
-; X64:       # %bb.0:
-; X64-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpperm_as_unpcklbw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 16, i8 0, i8 17, i8 1, i8 18, i8 2, i8 19, i8 3, i8 20, i8 4, i8 21, i8 5, i8 22, i8 6, i8 23, i8 7>)
   ret <16 x i8> %res0
 }
 
 define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
-; X32-LABEL: combine_vpperm_10zz32BA:
-; X32:       # %bb.0:
-; X32-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[2,3,0,1],zero,zero,zero,zero,xmm0[6,7,4,5],xmm1[6,7,4,5]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_vpperm_10zz32BA:
-; X64:       # %bb.0:
-; X64-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[2,3,0,1],zero,zero,zero,zero,xmm0[6,7,4,5],xmm1[6,7,4,5]
-; X64-NEXT:    retq
+; CHECK-LABEL: combine_vpperm_10zz32BA:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[2,3,0,1],zero,zero,zero,zero,xmm0[6,7,4,5],xmm1[6,7,4,5]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res1 = bitcast <4 x i32> %res0 to <16 x i8>
   %res2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res1, <16 x i8> undef, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
@@ -320,34 +221,34 @@
 
 ; FIXME: Duplicated load in i686
 define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
-; X86AVX-LABEL: buildvector_v4f32_0404:
-; X86AVX:       # %bb.0:
-; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X86AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X86AVX-NEXT:    vmovaps %xmm0, (%eax)
-; X86AVX-NEXT:    retl
+; X86-AVX-LABEL: buildvector_v4f32_0404:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
+; X86-AVX-NEXT:    retl
 ;
-; X86AVX2-LABEL: buildvector_v4f32_0404:
-; X86AVX2:       # %bb.0:
-; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X86AVX2-NEXT:    vmovapd %xmm0, (%eax)
-; X86AVX2-NEXT:    retl
+; X86-AVX2-LABEL: buildvector_v4f32_0404:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X86-AVX2-NEXT:    vmovapd %xmm0, (%eax)
+; X86-AVX2-NEXT:    retl
 ;
-; X64AVX-LABEL: buildvector_v4f32_0404:
-; X64AVX:       # %bb.0:
-; X64AVX-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0]
-; X64AVX-NEXT:    vmovaps %xmm0, (%rdi)
-; X64AVX-NEXT:    retq
+; X64-AVX-LABEL: buildvector_v4f32_0404:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0]
+; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
+; X64-AVX-NEXT:    retq
 ;
-; X64AVX2-LABEL: buildvector_v4f32_0404:
-; X64AVX2:       # %bb.0:
-; X64AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X64AVX2-NEXT:    vmovapd %xmm0, (%rdi)
-; X64AVX2-NEXT:    retq
+; X64-AVX2-LABEL: buildvector_v4f32_0404:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-AVX2-NEXT:    vmovapd %xmm0, (%rdi)
+; X64-AVX2-NEXT:    retq
   %v0 = insertelement <4 x float> undef, float %a, i32 0
   %v1 = insertelement <4 x float> %v0,   float %b, i32 1
   %v2 = insertelement <4 x float> %v1,   float %a, i32 2
@@ -357,13 +258,13 @@
 }
 
 define void @buildvector_v4f32_07z6(float %a, <4 x float> %b, <4 x float>* %ptr) {
-; X32-LABEL: buildvector_v4f32_07z6:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm1[0],xmm0[3],zero,xmm0[2]
-; X32-NEXT:    vmovaps %xmm0, (%eax)
-; X32-NEXT:    retl
+; X86-LABEL: buildvector_v4f32_07z6:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm1[0],xmm0[3],zero,xmm0[2]
+; X86-NEXT:    vmovaps %xmm0, (%eax)
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: buildvector_v4f32_07z6:
 ; X64:       # %bb.0:
@@ -381,82 +282,57 @@
 }
 
 define <2 x double> @constant_fold_vpermil2pd() {
-; X32-LABEL: constant_fold_vpermil2pd:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: constant_fold_vpermil2pd:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
-; X64-NEXT:    retq
+; CHECK-LABEL: constant_fold_vpermil2pd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> <double 1.0, double 2.0>, <2 x double> <double -2.0, double -1.0>, <2 x i64> <i64 4, i64 2>, i8 2)
   ret <2 x double> %1
 }
 
 define <4 x double> @constant_fold_vpermil2pd_256() {
-; X32-LABEL: constant_fold_vpermil2pd_256:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: constant_fold_vpermil2pd_256:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0]
-; X64-NEXT:    retq
+; CHECK-LABEL: constant_fold_vpermil2pd_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x double> <double -4.0, double -3.0, double -2.0, double -1.0>, <4 x i64> <i64 4, i64 8, i64 2, i64 0>, i8 2)
   ret <4 x double> %1
 }
 
 define <4 x float> @constant_fold_vpermil2ps() {
-; X32-LABEL: constant_fold_vpermil2ps:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: constant_fold_vpermil2ps:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0]
-; X64-NEXT:    retq
+; CHECK-LABEL: constant_fold_vpermil2ps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> <float -4.0, float -3.0, float -2.0, float -1.0>, <4 x i32> <i32 4, i32 0, i32 2, i32 8>, i8 2)
   ret <4 x float> %1
 }
 
 define <8 x float> @constant_fold_vpermil2ps_256() {
-; X32-LABEL: constant_fold_vpermil2ps_256:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0]
-; X32-NEXT:    retl
-;
-; X64-LABEL: constant_fold_vpermil2ps_256:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0]
-; X64-NEXT:    retq
+; CHECK-LABEL: constant_fold_vpermil2ps_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x float> <float -8.0, float -7.0, float -6.0, float -5.0, float -4.0, float -3.0, float -2.0, float -1.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 8, i32 0, i32 8, i32 0, i32 2>, i8 2)
   ret <8 x float> %1
 }
 
 define <16 x i8> @constant_fold_vpperm() {
-; X32-LABEL: constant_fold_vpperm:
-; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; X32-NEXT:    retl
-;
-; X64-LABEL: constant_fold_vpperm:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; X64-NEXT:    retq
+; CHECK-LABEL: constant_fold_vpperm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> <i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
   ret <16 x i8> %1
 }
 
 define <4 x float> @PR31296(i8* %in) {
-; X32-LABEL: PR31296:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,mem[0]
-; X32-NEXT:    retl
+; X86-LABEL: PR31296:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,mem[0]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: PR31296:
 ; X64:       # %bb.0: # %entry
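; NOTE (illustrative sketch, not part of the commit): the ret{{[l|q]}} endings
; above use FileCheck's {{...}} inline-regex syntax, so one shared line matches
; both the retl emitted for i686 triples and the retq emitted for x86_64; the
; {{\.LCPI.*}} pattern matches compiler-generated constant-pool labels the same
; way. A minimal shared block for a hypothetical function @f:
;
;   CHECK-LABEL: f:
;   CHECK:       vxorps %xmm0, %xmm0, %xmm0
;   CHECK-NEXT:  ret{{[l|q]}}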
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 199a05e..61d3fc3 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST
 ;
 ; Verify that the DAG combiner correctly folds bitwise operations across
 ; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
@@ -18,9 +18,9 @@
 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
 
 define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
-; ALL-LABEL: combine_pshufd1:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_pshufd1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    retq
 entry:
   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
@@ -28,9 +28,9 @@
 }
 
 define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
-; ALL-LABEL: combine_pshufd2:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_pshufd2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    retq
 entry:
   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
   %b.cast = bitcast <4 x i32> %b to <8 x i16>
@@ -41,9 +41,9 @@
 }
 
 define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
-; ALL-LABEL: combine_pshufd3:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_pshufd3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    retq
 entry:
   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
   %b.cast = bitcast <4 x i32> %b to <8 x i16>
@@ -113,9 +113,9 @@
 }
 
 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
-; ALL-LABEL: combine_pshuflw1:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_pshuflw1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    retq
 entry:
   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
@@ -123,9 +123,9 @@
 }
 
 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
-; ALL-LABEL: combine_pshuflw2:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_pshuflw2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    retq
 entry:
   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
@@ -811,9 +811,9 @@
 
 ; The following pair of shuffles is folded into vector %A.
 define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
-; ALL-LABEL: combine_nested_undef_test13:
-; ALL:       # %bb.0:
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_nested_undef_test13:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
   ret <4 x i32> %2
@@ -1371,9 +1371,9 @@
 }
 
 define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
-; ALL-LABEL: combine_test11:
-; ALL:       # %bb.0:
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_test11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   ret <4 x float> %2
@@ -1464,9 +1464,9 @@
 }
 
 define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
-; ALL-LABEL: combine_test16:
-; ALL:       # %bb.0:
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_test16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   ret <4 x i32> %2
@@ -2161,9 +2161,9 @@
 ;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
 
 define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
-; ALL-LABEL: combine_undef_input_test6:
-; ALL:       # %bb.0:
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_undef_input_test6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
   ret <4 x float> %2
@@ -2235,9 +2235,9 @@
 }
 
 define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
-; ALL-LABEL: combine_undef_input_test10:
-; ALL:       # %bb.0:
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_undef_input_test10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
   ret <4 x float> %2
@@ -2351,9 +2351,9 @@
 ; combined into a single legal shuffle operation.
 
 define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
-; ALL-LABEL: combine_undef_input_test16:
-; ALL:       # %bb.0:
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_undef_input_test16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
   ret <4 x float> %2
@@ -2425,9 +2425,9 @@
 }
 
 define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
-; ALL-LABEL: combine_undef_input_test20:
-; ALL:       # %bb.0:
-; ALL-NEXT:    retq
+; CHECK-LABEL: combine_undef_input_test20:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
   %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
   ret <4 x float> %2