; test/Transforms/InstCombine/x86-vperm2.ll - llvm - Git at Google

 ; RUN: opt < %s -instcombine -S | FileCheck %s

 ; This should never happen, but make sure we don't crash handling a non-constant immediate byte.

 ; The immediate is the runtime argument %b, so the intrinsic call is expected to be left in place.
 define <4 x double> @perm2pd_non_const_imm(<4 x double> %a0, <4 x double> %a1, i8 %b) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_non_const_imm
 ; CHECK-NEXT:  call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
 ; CHECK-NEXT:  ret <4 x double>
 }


 ; In the following 4 tests, both zero mask bits of the immediate are set.

 ; imm 136 = 0x88: bits 3 and 7 are both set, so the whole <4 x double> result is expected to be zero.
 define <4 x double> @perm2pd_0x88(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 136)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x88
 ; CHECK-NEXT:  ret <4 x double> zeroinitializer
 }

 ; Same 0x88 immediate (bits 3 and 7 set) on the <8 x float> intrinsic; expected result is all zeros.
 define <8 x float> @perm2ps_0x88(<8 x float> %a0, <8 x float> %a1) {
   %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 136)
   ret <8 x float> %res

 ; CHECK-LABEL: @perm2ps_0x88
 ; CHECK-NEXT:  ret <8 x float> zeroinitializer
 }

 ; Same 0x88 immediate (bits 3 and 7 set) on the <8 x i32> intrinsic; expected result is all zeros.
 define <8 x i32> @perm2si_0x88(<8 x i32> %a0, <8 x i32> %a1) {
   %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 136)
   ret <8 x i32> %res

 ; CHECK-LABEL: @perm2si_0x88
 ; CHECK-NEXT:  ret <8 x i32> zeroinitializer
 }

 ; Same 0x88 immediate (bits 3 and 7 set) on the AVX2 <4 x i64> intrinsic; expected result is all zeros.
 define <4 x i64> @perm2i_0x88(<4 x i64> %a0, <4 x i64> %a1) {
   %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 136)
   ret <4 x i64> %res

 ; CHECK-LABEL: @perm2i_0x88
 ; CHECK-NEXT:  ret <4 x i64> zeroinitializer
 }


 ; The other control bits are ignored when zero mask bits of the immediate are set.

 ; imm 255 = 0xff: every bit is set, including bits 3 and 7; the expected result is still all zeros.
 define <4 x double> @perm2pd_0xff(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 255)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0xff
 ; CHECK-NEXT:  ret <4 x double> zeroinitializer
 }


 ; The following 16 tests are simple shuffles, except for 2 cases where we can just return one of the
 ; source vectors. Verify that we generate the right shuffle masks and undef source operand where possible.

 ; imm 0 = 0x00: both result halves are the low half of %a0, so the second shuffle operand is undef.
 define <4 x double> @perm2pd_0x00(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x00
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 1 = 0x01: low result half = high half of %a0, high result half = low half of %a0.
 define <4 x double> @perm2pd_0x01(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 1)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x01
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 2 = 0x02: low result half = low half of %a1, high result half = low half of %a0.
 define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 2)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x02
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 3 = 0x03: low result half = high half of %a1, high result half = low half of %a0.
 define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x03
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 16 = 0x10: low half of %a0 then high half of %a0, i.e. %a0 itself, so no shuffle is expected.
 define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 16)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x10
 ; CHECK-NEXT:  ret <4 x double> %a0
 }

 ; imm 17 = 0x11: both result halves are the high half of %a0, so the second shuffle operand is undef.
 define <4 x double> @perm2pd_0x11(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 17)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x11
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 18 = 0x12: low result half = low half of %a1, high result half = high half of %a0.
 define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 18)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x12
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 19 = 0x13: low result half = high half of %a1, high result half = high half of %a0.
 define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 19)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x13
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 32 = 0x20: low result half = low half of %a0, high result half = low half of %a1.
 define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 32)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x20
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 33 = 0x21: low result half = high half of %a0, high result half = low half of %a1.
 define <4 x double> @perm2pd_0x21(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 33)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x21
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 34 = 0x22: both result halves are the low half of %a1, so the second shuffle operand is undef.
 define <4 x double> @perm2pd_0x22(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 34)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x22
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 35 = 0x23: low result half = high half of %a1, high result half = low half of %a1.
 define <4 x double> @perm2pd_0x23(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 35)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x23
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 48 = 0x30: low result half = low half of %a0, high result half = high half of %a1.
 define <4 x double> @perm2pd_0x30(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 48)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x30
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 49 = 0x31: low result half = high half of %a0, high result half = high half of %a1.
 define <4 x double> @perm2pd_0x31(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 49)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x31
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; imm 50 = 0x32: low half of %a1 then high half of %a1, i.e. %a1 itself, so no shuffle is expected.
 define <4 x double> @perm2pd_0x32(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 50)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x32
 ; CHECK-NEXT:  ret <4 x double> %a1
 }

 ; imm 51 = 0x33: both result halves are the high half of %a1, so the second shuffle operand is undef.
 define <4 x double> @perm2pd_0x33(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 51)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x33
 ; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 ; CHECK-NEXT:  ret <4 x double> %1
 }

 ; Confirm that a mask for 32-bit elements is also correct.

 ; imm 49 = 0x31 on <8 x float>: each 128-bit half is four elements, so the expected mask picks
 ; elements 4-7 of %a0 followed by elements 4-7 of %a1 (shuffle indices 12-15).
 define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
   %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 49)
   ret <8 x float> %res

 ; CHECK-LABEL: @perm2ps_0x31
 ; CHECK-NEXT:  %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  ret <8 x float> %1
 }


 ; Confirm that the AVX2 version works the same.

 ; imm 51 = 0x33 on the AVX2 <4 x i64> intrinsic: both result halves are the high half of %a1.
 define <4 x i64> @perm2i_0x33(<4 x i64> %a0, <4 x i64> %a1) {
   %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 51)
   ret <4 x i64> %res

 ; CHECK-LABEL: @perm2i_0x33
 ; CHECK-NEXT:  %1 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
 ; CHECK-NEXT:  ret <4 x i64> %1
 }


 ; Confirm that when a single zero mask bit is set, we replace a source vector with zeros.

 ; imm 129 = 0x81: bit 7 zeroes the high result half; the low result half is the high half of %a0.
 ; The expected shuffle therefore pairs %a0 with a constant vector whose leading elements are 0.0.
 define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 129)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x81
 ; CHECK-NEXT:  shufflevector <4 x double> %a0, <4 x double> <double 0.0{{.*}}, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double>
 }

 ; imm 131 = 0x83: bit 7 zeroes the high result half; the low result half is the high half of %a1.
 define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x83
 ; CHECK-NEXT:  shufflevector <4 x double> %a1, <4 x double> <double 0.0{{.*}}, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double>
 }

 ; imm 40 = 0x28: bit 3 zeroes the low result half; the high result half is the low half of %a1.
 define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 40)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x28
 ; CHECK-NEXT:  shufflevector <4 x double> <double 0.0{{.*}}, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double>
 }

 ; imm 8 = 0x08: bit 3 zeroes the low result half; the high result half is the low half of %a0.
 define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) {
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 8)
   ret <4 x double> %res

 ; CHECK-LABEL: @perm2pd_0x08
 ; CHECK-NEXT:  shufflevector <4 x double> <double 0.0{{.*}}, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x double>
 }

 ; Check one more with the AVX2 version.

 ; imm 40 = 0x28 on the AVX2 <4 x i64> intrinsic: low result half zeroed, high half = low half of %a1.
 define <4 x i64> @perm2i_0x28(<4 x i64> %a0, <4 x i64> %a1) {
   %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 40)
   ret <4 x i64> %res

 ; CHECK-LABEL: @perm2i_0x28
 ; CHECK-NEXT:  shufflevector <4 x i64> <i64 0{{.*}}, <4 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:  ret <4 x i64>
 }

 ; Declarations of the four intrinsics called by the tests in this file: the three AVX
 ; vperm2f128 variants (pd, ps, si) and the AVX2 vperm2i128. Each takes two 256-bit
 ; vectors and an i8 control byte.
 declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
 declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
 declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
 declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readnone
	; RUN: opt < %s -instcombine -S | FileCheck %s

	; This should never happen, but make sure we don't crash handling a non-constant immediate byte.

	; The immediate is the runtime argument %b, so the intrinsic call is expected to be left in place.
	define <4 x double> @perm2pd_non_const_imm(<4 x double> %a0, <4 x double> %a1, i8 %b) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_non_const_imm
	; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
	; CHECK-NEXT: ret <4 x double>
	}


	; In the following 4 tests, both zero mask bits of the immediate are set.

	; imm 136 = 0x88: bits 3 and 7 are both set, so the whole <4 x double> result is expected to be zero.
	define <4 x double> @perm2pd_0x88(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 136)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x88
	; CHECK-NEXT: ret <4 x double> zeroinitializer
	}

	; Same 0x88 immediate (bits 3 and 7 set) on the <8 x float> intrinsic; expected result is all zeros.
	define <8 x float> @perm2ps_0x88(<8 x float> %a0, <8 x float> %a1) {
	%res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 136)
	ret <8 x float> %res

	; CHECK-LABEL: @perm2ps_0x88
	; CHECK-NEXT: ret <8 x float> zeroinitializer
	}

	; Same 0x88 immediate (bits 3 and 7 set) on the <8 x i32> intrinsic; expected result is all zeros.
	define <8 x i32> @perm2si_0x88(<8 x i32> %a0, <8 x i32> %a1) {
	%res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 136)
	ret <8 x i32> %res

	; CHECK-LABEL: @perm2si_0x88
	; CHECK-NEXT: ret <8 x i32> zeroinitializer
	}

	; Same 0x88 immediate (bits 3 and 7 set) on the AVX2 <4 x i64> intrinsic; expected result is all zeros.
	define <4 x i64> @perm2i_0x88(<4 x i64> %a0, <4 x i64> %a1) {
	%res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 136)
	ret <4 x i64> %res

	; CHECK-LABEL: @perm2i_0x88
	; CHECK-NEXT: ret <4 x i64> zeroinitializer
	}


	; The other control bits are ignored when zero mask bits of the immediate are set.

	; imm 255 = 0xff: every bit is set, including bits 3 and 7; the expected result is still all zeros.
	define <4 x double> @perm2pd_0xff(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 255)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0xff
	; CHECK-NEXT: ret <4 x double> zeroinitializer
	}


	; The following 16 tests are simple shuffles, except for 2 cases where we can just return one of the
	; source vectors. Verify that we generate the right shuffle masks and undef source operand where possible.

	; imm 0 = 0x00: both result halves are the low half of %a0, so the second shuffle operand is undef.
	define <4 x double> @perm2pd_0x00(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x00
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 1 = 0x01: low result half = high half of %a0, high result half = low half of %a0.
	define <4 x double> @perm2pd_0x01(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 1)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x01
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 2 = 0x02: low result half = low half of %a1, high result half = low half of %a0.
	define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 2)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x02
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 3 = 0x03: low result half = high half of %a1, high result half = low half of %a0.
	define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x03
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 16 = 0x10: low half of %a0 then high half of %a0, i.e. %a0 itself, so no shuffle is expected.
	define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 16)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x10
	; CHECK-NEXT: ret <4 x double> %a0
	}

	; imm 17 = 0x11: both result halves are the high half of %a0, so the second shuffle operand is undef.
	define <4 x double> @perm2pd_0x11(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 17)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x11
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 18 = 0x12: low result half = low half of %a1, high result half = high half of %a0.
	define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 18)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x12
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 19 = 0x13: low result half = high half of %a1, high result half = high half of %a0.
	define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 19)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x13
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 32 = 0x20: low result half = low half of %a0, high result half = low half of %a1.
	define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 32)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x20
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 33 = 0x21: low result half = high half of %a0, high result half = low half of %a1.
	define <4 x double> @perm2pd_0x21(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 33)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x21
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 34 = 0x22: both result halves are the low half of %a1, so the second shuffle operand is undef.
	define <4 x double> @perm2pd_0x22(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 34)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x22
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 35 = 0x23: low result half = high half of %a1, high result half = low half of %a1.
	define <4 x double> @perm2pd_0x23(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 35)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x23
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 48 = 0x30: low result half = low half of %a0, high result half = high half of %a1.
	define <4 x double> @perm2pd_0x30(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 48)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x30
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 49 = 0x31: low result half = high half of %a0, high result half = high half of %a1.
	define <4 x double> @perm2pd_0x31(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 49)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x31
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; imm 50 = 0x32: low half of %a1 then high half of %a1, i.e. %a1 itself, so no shuffle is expected.
	define <4 x double> @perm2pd_0x32(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 50)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x32
	; CHECK-NEXT: ret <4 x double> %a1
	}

	; imm 51 = 0x33: both result halves are the high half of %a1, so the second shuffle operand is undef.
	define <4 x double> @perm2pd_0x33(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 51)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x33
	; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
	; CHECK-NEXT: ret <4 x double> %1
	}

	; Confirm that a mask for 32-bit elements is also correct.

	; imm 49 = 0x31 on <8 x float>: each 128-bit half is four elements, so the expected mask picks
	; elements 4-7 of %a0 followed by elements 4-7 of %a1 (shuffle indices 12-15).
	define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
	%res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 49)
	ret <8 x float> %res

	; CHECK-LABEL: @perm2ps_0x31
	; CHECK-NEXT: %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
	; CHECK-NEXT: ret <8 x float> %1
	}


	; Confirm that the AVX2 version works the same.

	; imm 51 = 0x33 on the AVX2 <4 x i64> intrinsic: both result halves are the high half of %a1.
	define <4 x i64> @perm2i_0x33(<4 x i64> %a0, <4 x i64> %a1) {
	%res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 51)
	ret <4 x i64> %res

	; CHECK-LABEL: @perm2i_0x33
	; CHECK-NEXT: %1 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
	; CHECK-NEXT: ret <4 x i64> %1
	}


	; Confirm that when a single zero mask bit is set, we replace a source vector with zeros.

	; imm 129 = 0x81: bit 7 zeroes the high result half; the low result half is the high half of %a0.
	; The expected shuffle therefore pairs %a0 with a constant vector whose leading elements are 0.0.
	define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 129)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x81
	; CHECK-NEXT: shufflevector <4 x double> %a0, <4 x double> <double 0.0{{.*}}, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
	; CHECK-NEXT: ret <4 x double>
	}

	; imm 131 = 0x83: bit 7 zeroes the high result half; the low result half is the high half of %a1.
	define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x83
	; CHECK-NEXT: shufflevector <4 x double> %a1, <4 x double> <double 0.0{{.*}}, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
	; CHECK-NEXT: ret <4 x double>
	}

	; imm 40 = 0x28: bit 3 zeroes the low result half; the high result half is the low half of %a1.
	define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 40)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x28
	; CHECK-NEXT: shufflevector <4 x double> <double 0.0{{.*}}, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
	; CHECK-NEXT: ret <4 x double>
	}

	; imm 8 = 0x08: bit 3 zeroes the low result half; the high result half is the low half of %a0.
	define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) {
	%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 8)
	ret <4 x double> %res

	; CHECK-LABEL: @perm2pd_0x08
	; CHECK-NEXT: shufflevector <4 x double> <double 0.0{{.*}}, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
	; CHECK-NEXT: ret <4 x double>
	}

	; Check one more with the AVX2 version.

	; imm 40 = 0x28 on the AVX2 <4 x i64> intrinsic: low result half zeroed, high half = low half of %a1.
	define <4 x i64> @perm2i_0x28(<4 x i64> %a0, <4 x i64> %a1) {
	%res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 40)
	ret <4 x i64> %res

	; CHECK-LABEL: @perm2i_0x28
	; CHECK-NEXT: shufflevector <4 x i64> <i64 0{{.*}}, <4 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
	; CHECK-NEXT: ret <4 x i64>
	}

	; Declarations of the four intrinsics called by the tests in this file: the three AVX
	; vperm2f128 variants (pd, ps, si) and the AVX2 vperm2i128. Each takes two 256-bit
	; vectors and an i8 control byte.
	declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
	declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
	declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
	declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readnone