; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with side effects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
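; A minimal illustrative sketch of the pattern used throughout this file (the
; @sketch_fold_psubb name and the vpsubb choice are ours, not part of the
; autogenerated tests): the inline asm "nop" clobbers xmm2-xmm15, leaving too
; few registers to keep %a1 live across it, so %a1 must be spilled; llc is
; then expected to fold the reload into the memory operand of the vpsubb
; instead of emitting a separate vmovups. No CHECK lines are attached, so
; FileCheck does not verify this sketch.
define <32 x i8> @sketch_fold_psubb(<32 x i8> %a0, <32 x i8> %a1) {
  ; Force a spill of %a1 across the nop.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ; Expected codegen (assumption): vpsubb with a folded stack-slot reload,
  ; e.g. "vpsubb {{[-0-9]+}}(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload".
  %2 = sub <32 x i8> %a0, %a1
  ret <32 x i8> %2
}
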
|  | define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_broadcastsd_ymm: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero | 
|  | ; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer | 
|  | ; fadd forces execution domain | 
|  | %3 = fadd <4 x double> %2, <double 0x1, double 0x0, double 0x0, double 0x0> | 
|  | ret <4 x double> %3 | 
|  | } | 
|  |  | 
|  | define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_broadcastss: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero | 
|  | ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer | 
|  | ; fadd forces execution domain | 
|  | %3 = fadd <4 x float> %2, <float 1.0, float 0x0, float 0x0, float 0x0> | 
|  | ret <4 x float> %3 | 
|  | } | 
|  |  | 
|  | define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_broadcastss_ymm: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero | 
|  | ; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer | 
|  | ; fadd forces execution domain | 
|  | %3 = fadd <8 x float> %2, <float 1.0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> | 
|  | ret <8 x float> %3 | 
|  | } | 
|  |  | 
|  | define <4 x i32> @stack_fold_extracti128(<8 x i16> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_extracti128: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero | 
|  | ; CHECK-NEXT:    vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload | 
|  | ; CHECK-NEXT:    vzeroupper | 
|  | ; CHECK-NEXT:    retq | 
|  | ; zext forces execution domain | 
|  | %t1 = zext <8 x i16> %a0 to <8 x i32> | 
|  | %t2 = shufflevector <8 x i32> %t1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
|  | %t3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | ret <4 x i32> %t2 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_inserti128: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0 | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> | 
|  | ; add forces execution domain | 
|  | %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> | 
|  | ret <8 x i32> %3 | 
|  | } | 
|  |  | 
|  | define <16 x i16> @stack_fold_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_mpsadbw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vmpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  | declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone | 
|  |  | 
|  | define <32 x i8> @stack_fold_pabsb(<32 x i8> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pabsb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp sgt <32 x i8> %a0, zeroinitializer | 
|  | %3 = sub <32 x i8> zeroinitializer, %a0 | 
|  | %4 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %3 | 
|  | ret <32 x i8> %4 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_pabsd(<8 x i32> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pabsd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp sgt <8 x i32> %a0, zeroinitializer | 
|  | %3 = sub <8 x i32> zeroinitializer, %a0 | 
|  | %4 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %3 | 
|  | ret <8 x i32> %4 | 
|  | } | 
|  |  | 
|  | define <16 x i16> @stack_fold_pabsw(<16 x i16> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pabsw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp sgt <16 x i16> %a0, zeroinitializer | 
|  | %3 = sub <16 x i16> zeroinitializer, %a0 | 
|  | %4 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %3 | 
|  | ret <16 x i16> %4 | 
|  | } | 
|  |  | 
|  | define <16 x i16> @stack_fold_packssdw(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_packssdw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  | declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone | 
|  |  | 
|  | define <32 x i8> @stack_fold_packsswb(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_packsswb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) | 
|  | ret <32 x i8> %2 | 
|  | } | 
|  | declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone | 
|  |  | 
|  | define <16 x i16> @stack_fold_packusdw(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_packusdw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  | declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone | 
|  |  | 
|  | define <32 x i8> @stack_fold_packuswb(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_packuswb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) | 
|  | ret <32 x i8> %2 | 
|  | } | 
|  | declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone | 
|  |  | 
|  | define <32 x i8> @stack_fold_paddb(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_paddb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = add <32 x i8> %a0, %a1 | 
|  | ret <32 x i8> %2 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_paddd(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_paddd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = add <8 x i32> %a0, %a1 | 
|  | ret <8 x i32> %2 | 
|  | } | 
|  |  | 
|  | define <4 x i64> @stack_fold_paddq(<4 x i64> %a0, <4 x i64> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_paddq: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = add <4 x i64> %a0, %a1 | 
|  | ret <4 x i64> %2 | 
|  | } | 
|  |  | 
|  | define <32 x i8> @stack_fold_paddsb(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_paddsb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) | 
|  | ret <32 x i8> %2 | 
|  | } | 
|  | declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone | 
|  |  | 
|  | define <16 x i16> @stack_fold_paddsw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_paddsw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  | declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone | 
|  |  | 
|  | define <32 x i8> @stack_fold_paddusb(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_paddusb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) | 
|  | ret <32 x i8> %2 | 
|  | } | 
|  | declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone | 
|  |  | 
|  | define <16 x i16> @stack_fold_paddusw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_paddusw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  | declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone | 
|  |  | 
|  | define <16 x i16> @stack_fold_paddw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_paddw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = add <16 x i16> %a0, %a1 | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  |  | 
|  | define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_palignr: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    # ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> | 
|  | ret <32 x i8> %2 | 
|  | } | 
|  |  | 
|  | define <32 x i8> @stack_fold_pand(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pand: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpand {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = and <32 x i8> %a0, %a1 | 
|  | ; add forces execution domain | 
|  | %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> | 
|  | ret <32 x i8> %3 | 
|  | } | 
|  |  | 
|  | define <32 x i8> @stack_fold_pandn(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pandn: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpandn {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = xor <32 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> | 
|  | %3 = and <32 x i8> %2, %a1 | 
|  | ; add forces execution domain | 
|  | %4 = add <32 x i8> %3, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> | 
|  | ret <32 x i8> %4 | 
|  | } | 
|  |  | 
|  | define <32 x i8> @stack_fold_pavgb(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pavgb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = zext <32 x i8> %a0 to <32 x i16> | 
|  | %3 = zext <32 x i8> %a1 to <32 x i16> | 
|  | %4 = add <32 x i16> %2, %3 | 
|  | %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> | 
|  | %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> | 
|  | %7 = trunc <32 x i16> %6 to <32 x i8> | 
|  | ret <32 x i8> %7 | 
|  | } | 
|  |  | 
|  | define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pavgw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = zext <16 x i16> %a0 to <16 x i32> | 
|  | %3 = zext <16 x i16> %a1 to <16 x i32> | 
|  | %4 = add <16 x i32> %2, %3 | 
|  | %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> | 
|  | %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> | 
|  | %7 = trunc <16 x i32> %6 to <16 x i16> | 
|  | ret <16 x i16> %7 | 
|  | } | 
|  |  | 
|  | define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pblendd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    # xmm0 = mem[0,1,2],xmm0[3] | 
|  | ; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 | 
|  | ; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3> | 
|  | ; add forces execution domain | 
|  | %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1> | 
|  | ret <4 x i32> %3 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pblendd_ymm: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    # ymm0 = mem[0,1,2],ymm0[3,4,5,6,7] | 
|  | ; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7> | 
|  | ; add forces execution domain | 
|  | %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> | 
|  | ret <8 x i32> %3 | 
|  | } | 
|  |  | 
|  | define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) { | 
|  | ; CHECK-LABEL: stack_fold_pblendvb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a1, <32 x i8> %c, <32 x i8> %a0) | 
|  | ret <32 x i8> %2 | 
|  | } | 
|  | declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone | 
|  |  | 
|  | define <16 x i16> @stack_fold_pblendw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pblendw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    # ymm0 = mem[0,1,2],ymm0[3,4,5,6,7],mem[8,9,10],ymm0[11,12,13,14,15] | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15> | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  |  | 
|  | define <16 x i8> @stack_fold_pbroadcastb(<16 x i8> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pbroadcastb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpbroadcastb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer | 
|  | ret <16 x i8> %2 | 
|  | } | 
|  |  | 
|  | define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pbroadcastb_ymm: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpbroadcastb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> zeroinitializer | 
|  | ret <32 x i8> %2 | 
|  | } | 
|  |  | 
|  | define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pbroadcastd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> zeroinitializer | 
|  | ; add forces execution domain | 
|  | %3 = add <4 x i32> %2, <i32 2, i32 1, i32 1, i32 1> | 
|  | ret <4 x i32> %3 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pbroadcastd_ymm: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> zeroinitializer | 
|  | ; add forces execution domain | 
|  | %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> | 
|  | ret <8 x i32> %3 | 
|  | } | 
|  |  | 
|  | define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pbroadcastq: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer | 
|  | ; add forces execution domain | 
|  | %3 = add <2 x i64> %2, <i64 2, i64 1> | 
|  | ret <2 x i64> %3 | 
|  | } | 
|  |  | 
|  | define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pbroadcastq_ymm: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer | 
|  | ; add forces execution domain | 
|  | %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1> | 
|  | ret <4 x i64> %3 | 
|  | } | 
|  |  | 
|  | define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pbroadcastw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpbroadcastw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer | 
|  | ret <8 x i16> %2 | 
|  | } | 
|  |  | 
|  | define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_pbroadcastw_ymm: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpbroadcastw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> zeroinitializer | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  |  | 
|  | define <32 x i8> @stack_fold_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pcmpeqb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp eq <32 x i8> %a0, %a1 | 
|  | %3 = sext <32 x i1> %2 to <32 x i8> | 
|  | ret <32 x i8> %3 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pcmpeqd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp eq <8 x i32> %a0, %a1 | 
|  | %3 = sext <8 x i1> %2 to <8 x i32> | 
|  | ret <8 x i32> %3 | 
|  | } | 
|  |  | 
|  | define <4 x i64> @stack_fold_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pcmpeqq: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp eq <4 x i64> %a0, %a1 | 
|  | %3 = sext <4 x i1> %2 to <4 x i64> | 
|  | ret <4 x i64> %3 | 
|  | } | 
|  |  | 
|  | define <16 x i16> @stack_fold_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pcmpeqw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp eq <16 x i16> %a0, %a1 | 
|  | %3 = sext <16 x i1> %2 to <16 x i16> | 
|  | ret <16 x i16> %3 | 
|  | } | 
|  |  | 
|  | define <32 x i8> @stack_fold_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pcmpgtb: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp sgt <32 x i8> %a0, %a1 | 
|  | %3 = sext <32 x i1> %2 to <32 x i8> | 
|  | ret <32 x i8> %3 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pcmpgtd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp sgt <8 x i32> %a0, %a1 | 
|  | %3 = sext <8 x i1> %2 to <8 x i32> | 
|  | ret <8 x i32> %3 | 
|  | } | 
|  |  | 
|  | define <4 x i64> @stack_fold_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pcmpgtq: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp sgt <4 x i64> %a0, %a1 | 
|  | %3 = sext <4 x i1> %2 to <4 x i64> | 
|  | ret <4 x i64> %3 | 
|  | } | 
|  |  | 
|  | define <16 x i16> @stack_fold_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pcmpgtw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = icmp sgt <16 x i16> %a0, %a1 | 
|  | %3 = sext <16 x i1> %2 to <16 x i16> | 
|  | ret <16 x i16> %3 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_perm2i128(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_perm2i128: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vperm2i128 $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    # ymm0 = ymm0[2,3],mem[0,1] | 
|  | ; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> | 
|  | ; add forces execution domain | 
|  | %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> | 
|  | ret <8 x i32> %3 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_permd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0) | 
|  | ; add forces execution domain | 
|  | %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> | 
|  | ret <8 x i32> %3 | 
|  | } | 
|  | declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly | 
|  |  | 
|  | define <4 x double> @stack_fold_permpd(<4 x double> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_permpd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    # ymm0 = mem[3,2,2,3] | 
|  | ; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 | 
|  | ; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3> | 
|  | ; fadd forces execution domain | 
|  | %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0> | 
|  | ret <4 x double> %3 | 
|  | } | 
|  |  | 
|  | define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_permps: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0) | 
|  | ret <8 x float> %2 | 
|  | } | 
|  | declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly | 
|  |  | 
|  | define <4 x i64> @stack_fold_permq(<4 x i64> %a0) { | 
|  | ; CHECK-LABEL: stack_fold_permq: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    # ymm0 = mem[3,2,2,3] | 
|  | ; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3> | 
|  | ; add forces execution domain | 
|  | %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1> | 
|  | ret <4 x i64> %3 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_phaddd(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_phaddd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vphaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) | 
|  | ret <8 x i32> %2 | 
|  | } | 
|  | declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone | 
|  |  | 
|  | define <16 x i16> @stack_fold_phaddsw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_phaddsw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vphaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  | declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone | 
|  |  | 
|  | define <16 x i16> @stack_fold_phaddw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_phaddw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vphaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  | declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone | 
|  |  | 
|  | define <8 x i32> @stack_fold_phsubd(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_phsubd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vphsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) | 
|  | ret <8 x i32> %2 | 
|  | } | 
|  | declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone | 
|  |  | 
|  | define <16 x i16> @stack_fold_phsubsw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_phsubsw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vphsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1) | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  | declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone | 
|  |  | 
|  | define <16 x i16> @stack_fold_phsubw(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_phsubw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vphsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  | declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone | 
|  |  | 
|  | define <16 x i16> @stack_fold_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pmaddubsw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) | 
|  | ret <16 x i16> %2 | 
|  | } | 
|  | declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone | 
|  |  | 
|  | define <8 x i32> @stack_fold_pmaddwd(<16 x i16> %a0, <16 x i16> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_pmaddwd: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) | 
|  | ret <8 x i32> %2 | 
|  | } | 
|  | declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone | 
|  |  | 
define <32 x i8> @stack_fold_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <32 x i8> %a0, %a1
%3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <8 x i32> %a0, %a1
%3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
ret <8 x i32> %3
}

define <16 x i16> @stack_fold_pmaxsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <16 x i16> %a0, %a1
%3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
ret <16 x i16> %3
}

define <32 x i8> @stack_fold_pmaxub(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxub:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ugt <32 x i8> %a0, %a1
%3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ugt <8 x i32> %a0, %a1
%3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
ret <8 x i32> %3
}

define <16 x i16> @stack_fold_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ugt <16 x i16> %a0, %a1
%3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
ret <16 x i16> %3
}

define <32 x i8> @stack_fold_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pminsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp slt <32 x i8> %a0, %a1
%3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp slt <8 x i32> %a0, %a1
%3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
ret <8 x i32> %3
}

define <16 x i16> @stack_fold_pminsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pminsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp slt <16 x i16> %a0, %a1
%3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
ret <16 x i16> %3
}

define <32 x i8> @stack_fold_pminub(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pminub:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ult <32 x i8> %a0, %a1
%3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pminud(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ult <8 x i32> %a0, %a1
%3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
ret <8 x i32> %3
}

define <16 x i16> @stack_fold_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pminuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ult <16 x i16> %a0, %a1
%3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
ret <16 x i16> %3
}

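; Sign-extension folds: the source vector is spilled to a 16-byte slot and the
; vpmovsx* instruction reloads it directly, widening into a ymm register.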
define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = sext <8 x i8> %2 to <8 x i32>
ret <8 x i32> %3
}

define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = sext <4 x i8> %2 to <4 x i64>
ret <4 x i64> %3
}

define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sext <16 x i8> %a0 to <16 x i16>
ret <16 x i16> %2
}

define <4 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pmovsxdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sext <4 x i32> %a0 to <4 x i64>
ret <4 x i64> %2
}

define <8 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovsxwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sext <8 x i16> %a0 to <8 x i32>
ret <8 x i32> %2
}

define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovsxwq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = sext <4 x i16> %2 to <4 x i64>
ret <4 x i64> %3
}

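; Zero-extension folds: same pattern as the sign-extension tests, but the
; autogenerated shuffle comments additionally verify the inserted zero lanes.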
define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = zext <8 x i8> %2 to <8 x i32>
ret <8 x i32> %3
}

define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = zext <4 x i8> %2 to <4 x i64>
ret <4 x i64> %3
}

define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = zext <16 x i8> %a0 to <16 x i16>
ret <16 x i16> %2
}

define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pmovzxdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = zext <4 x i32> %a0 to <4 x i64>
ret <4 x i64> %2
}

define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovzxwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = zext <8 x i16> %a0 to <8 x i32>
ret <8 x i32> %2
}

define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovzxwq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = zext <4 x i16> %2 to <4 x i64>
ret <4 x i64> %3
}

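; vpmuldq multiplies the even (low) 32-bit lane of each 64-bit element; the IR
; below models the sign extension of those lanes with a shl/ashr pair.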
define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmuldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = bitcast <8 x i32> %a0 to <4 x i64>
%3 = bitcast <8 x i32> %a1 to <4 x i64>
%4 = shl <4 x i64> %2, <i64 32, i64 32, i64 32, i64 32>
%5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
%6 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32>
%7 = ashr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
%8 = mul <4 x i64> %5, %7
ret <4 x i64> %8
}

define <16 x i16> @stack_fold_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhrsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_pmulhuw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_pmulhw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulhw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <8 x i32> @stack_fold_pmulld(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmulld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = mul <8 x i32> %a0, %a1
ret <8 x i32> %2
}

define <16 x i16> @stack_fold_pmullw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmullw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = mul <16 x i16> %a0, %a1
ret <16 x i16> %2
}

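; vpmuludq is the unsigned counterpart: each 64-bit lane is masked to its low
; 32 bits before the multiply.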
define <4 x i64> @stack_fold_pmuludq(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmuludq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = bitcast <8 x i32> %a0 to <4 x i64>
%3 = bitcast <8 x i32> %a1 to <4 x i64>
%4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%6 = mul <4 x i64> %4, %5
ret <4 x i64> %6
}

define <32 x i8> @stack_fold_por(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_por:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpor {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = or <32 x i8> %a0, %a1
; add forces execution domain
%3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <32 x i8> %3
}

define <4 x i64> @stack_fold_psadbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psadbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <32 x i8> @stack_fold_pshufb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_pshufd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; add forces execution domain
%3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %3
}

define <16 x i16> @stack_fold_vpshufhw(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_vpshufhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
ret <16 x i16> %2
}

define <16 x i16> @stack_fold_vpshuflw(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_vpshuflw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %2
}

define <32 x i8> @stack_fold_psignb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psignb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <8 x i32> @stack_fold_psignd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psignd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i16> @stack_fold_psignw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psignw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

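; Shift tests: vpsll/vpsrl/vpsra fold a 16-byte reload of the xmm shift count,
; while the AVX2 variable shifts (vpsllv*, vpsrlv*, vpsrav*) fold the
; per-element count vector at its natural width.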
define <8 x i32> @stack_fold_pslld(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pslld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @stack_fold_psllq(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psllvd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psllvd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllvq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllvq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <16 x i16> @stack_fold_psllw(<16 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psllw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <8 x i32> @stack_fold_psrad(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrad:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psravd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psravd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i16> @stack_fold_psraw(<16 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psraw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <8 x i32> @stack_fold_psrld(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @stack_fold_psrlq(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrlvd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrlvd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlvq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlvq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <16 x i16> @stack_fold_psrlw(<16 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psrlw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

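; Subtraction tests: plain sub for the wrapping forms, plus the generic
; llvm.ssub.sat/llvm.usub.sat intrinsics for the saturating vpsubs*/vpsubus*.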
define <32 x i8> @stack_fold_psubb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <32 x i8> %a0, %a1
ret <32 x i8> %2
}

define <8 x i32> @stack_fold_psubd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psubd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <8 x i32> %a0, %a1
ret <8 x i32> %2
}

define <4 x i64> @stack_fold_psubq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_psubq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <4 x i64> %a0, %a1
ret <4 x i64> %2
}

define <32 x i8> @stack_fold_psubsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
}
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_psubsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @stack_fold_psubusb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubusb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
}
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_psubusw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubusw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
}
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_psubw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <16 x i16> %a0, %a1
ret <16 x i16> %2
}

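; Unpack tests: shufflevector interleave masks that should lower to vpunpck*;
; the 256-bit forms interleave within each 128-bit lane independently, as the
; mask indices below show.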
|  | define <32 x i8> @stack_fold_punpckhbw(<32 x i8> %a0, <32 x i8> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_punpckhbw: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> | 
|  | ret <32 x i8> %2 | 
|  | } | 
|  |  | 
|  | define <8 x i32> @stack_fold_punpckhdq(<8 x i32> %a0, <8 x i32> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_punpckhdq: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] | 
|  | ; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> | 
|  | ; add forces execution domain | 
|  | %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> | 
|  | ret <8 x i32> %3 | 
|  | } | 
|  |  | 
|  | define <4 x i64> @stack_fold_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1) { | 
|  | ; CHECK-LABEL: stack_fold_punpckhqdq: | 
|  | ; CHECK:       # %bb.0: | 
|  | ; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill | 
|  | ; CHECK-NEXT:    #APP | 
|  | ; CHECK-NEXT:    nop | 
|  | ; CHECK-NEXT:    #NO_APP | 
|  | ; CHECK-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload | 
|  | ; CHECK-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] | 
|  | ; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 | 
|  | ; CHECK-NEXT:    retq | 
|  | %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() | 
|  | %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> | 
|  | ; add forces execution domain | 
|  | %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1> | 
|  | ret <4 x i64> %3 | 
|  | } | 
|  |  | 
define <16 x i16> @stack_fold_punpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpckhwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i16> %2
}

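; 8-bit low unpack; byte unpacks likewise need no domain-forcing add.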
define <32 x i8> @stack_fold_punpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_punpcklbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  ret <32 x i8> %2
}

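; 32-bit low unpack with the reload folded from the spill slot.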
define <8 x i32> @stack_fold_punpckldq(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_punpckldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ; add forces execution domain (keeps the shuffle as vpunpckldq instead of the FP-domain vunpcklps)
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

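; The matching 64-bit low unpack, again folding the 32-byte reload.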
define <4 x i64> @stack_fold_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_punpcklqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; add forces execution domain (keeps the shuffle as vpunpcklqdq instead of the FP-domain vunpcklpd)
  %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

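; 16-bit low unpack, again without a domain-forcing add.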
define <16 x i16> @stack_fold_punpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpcklwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  ret <16 x i16> %2
}

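; Check folding of the 32-byte reload into a bitwise xor.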
define <32 x i8> @stack_fold_pxor(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pxor:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpxor {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = xor <32 x i8> %a0, %a1
  ; add forces execution domain (keeps the xor as vpxor instead of the FP-domain vxorps)
  %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %3
}