| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s |
| |
| target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" |
| target triple = "x86_64-unknown-unknown" |
| |
| ; Stack reload folding tests. |
| ; |
| ; By including a nop call with side effects we can force a partial register spill of the |
| ; relevant registers and check that the reload is correctly folded into the instruction. |
| |
| define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) { |
| ; CHECK-LABEL: stack_fold_broadcastsd_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero |
| ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 |
| ; CHECK-NEXT: retq |
| ; The asm nop clobbers xmm1-xmm15, forcing %a0 to be spilled; the CHECKs above |
| ; assert that the reload is folded into vbroadcastsd's memory operand. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer |
| ; fadd forces execution domain |
| %3 = fadd <4 x double> %2, <double 0x1, double 0x0, double 0x0, double 0x0> |
| ret <4 x double> %3 |
| } |
| |
| define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) { |
| ; CHECK-LABEL: stack_fold_broadcastss: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero |
| ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vbroadcastss (xmm form). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer |
| ; fadd forces execution domain |
| %3 = fadd <4 x float> %2, <float 1.0, float 0x0, float 0x0, float 0x0> |
| ret <4 x float> %3 |
| } |
| |
| define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) { |
| ; CHECK-LABEL: stack_fold_broadcastss_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero |
| ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vbroadcastss (ymm form). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer |
| ; fadd forces execution domain |
| %3 = fadd <8 x float> %2, <float 1.0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0> |
| ret <8 x float> %3 |
| } |
| |
| define <4 x i32> @stack_fold_extracti128(<8 x i16> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_extracti128: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero |
| ; CHECK-NEXT: vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload |
| ; CHECK-NEXT: vzeroupper |
| ; CHECK-NEXT: retq |
| ; Unlike the other tests, the CHECKs here assert the SPILL itself is folded: |
| ; vextracti128 stores its result straight to the stack slot. |
| ; zext forces execution domain |
| %t1 = zext <8 x i16> %a0 to <8 x i32> |
| %t2 = shufflevector <8 x i32> %t1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| %t3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| ret <4 x i32> %t2 |
| } |
| |
| define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_inserti128: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spilled %a1 reload is folded into vinserti128. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| ; add forces execution domain |
| %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> |
| ret <8 x i32> %3 |
| } |
| |
| define <16 x i16> @stack_fold_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_mpsadbw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vmpsadbw (imm = 7). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone |
| |
| define <32 x i8> @stack_fold_pabsb(<32 x i8> %a0) { |
| ; CHECK-LABEL: stack_fold_pabsb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; icmp/sub/select form abs(); CHECKs assert it folds the reload into vpabsb. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = icmp sgt <32 x i8> %a0, zeroinitializer |
| %3 = sub <32 x i8> zeroinitializer, %a0 |
| %4 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %3 |
| ret <32 x i8> %4 |
| } |
| |
| define <8 x i32> @stack_fold_pabsd(<8 x i32> %a0) { |
| ; CHECK-LABEL: stack_fold_pabsd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; icmp/sub/select form abs(); CHECKs assert it folds the reload into vpabsd. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = icmp sgt <8 x i32> %a0, zeroinitializer |
| %3 = sub <8 x i32> zeroinitializer, %a0 |
| %4 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %3 |
| ret <8 x i32> %4 |
| } |
| |
| define <16 x i16> @stack_fold_pabsw(<16 x i16> %a0) { |
| ; CHECK-LABEL: stack_fold_pabsw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; icmp/sub/select form abs(); CHECKs assert it folds the reload into vpabsw. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = icmp sgt <16 x i16> %a0, zeroinitializer |
| %3 = sub <16 x i16> zeroinitializer, %a0 |
| %4 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %3 |
| ret <16 x i16> %4 |
| } |
| |
| define <16 x i16> @stack_fold_packssdw(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_packssdw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vpackssdw. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone |
| |
| define <32 x i8> @stack_fold_packsswb(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_packsswb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vpacksswb. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) |
| ret <32 x i8> %2 |
| } |
| declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone |
| |
| define <16 x i16> @stack_fold_packusdw(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_packusdw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vpackusdw. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone |
| |
| define <32 x i8> @stack_fold_packuswb(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_packuswb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vpackuswb. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) |
| ret <32 x i8> %2 |
| } |
| declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone |
| |
| define <32 x i8> @stack_fold_paddb(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_paddb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vpaddb. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = add <32 x i8> %a0, %a1 |
| ret <32 x i8> %2 |
| } |
| |
| define <8 x i32> @stack_fold_paddd(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_paddd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vpaddd. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = add <8 x i32> %a0, %a1 |
| ret <8 x i32> %2 |
| } |
| |
| define <4 x i64> @stack_fold_paddq(<4 x i64> %a0, <4 x i64> %a1) { |
| ; CHECK-LABEL: stack_fold_paddq: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vpaddq. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = add <4 x i64> %a0, %a1 |
| ret <4 x i64> %2 |
| } |
| |
| define <32 x i8> @stack_fold_paddsb(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_paddsb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; Generic llvm.sadd.sat; CHECKs assert it folds the reload into vpaddsb. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) |
| ret <32 x i8> %2 |
| } |
| declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone |
| |
| define <16 x i16> @stack_fold_paddsw(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_paddsw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; Generic llvm.sadd.sat; CHECKs assert it folds the reload into vpaddsw. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone |
| |
| define <32 x i8> @stack_fold_paddusb(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_paddusb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; Generic llvm.uadd.sat; CHECKs assert it folds the reload into vpaddusb. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) |
| ret <32 x i8> %2 |
| } |
| declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone |
| |
| define <16 x i16> @stack_fold_paddusw(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_paddusw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; Generic llvm.uadd.sat; CHECKs assert it folds the reload into vpaddusw. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone |
| |
| define <16 x i16> @stack_fold_paddw(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_paddw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vpaddw. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = add <16 x i16> %a0, %a1 |
| ret <16 x i16> %2 |
| } |
| |
| define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_palignr: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: # ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] |
| ; CHECK-NEXT: retq |
| ; Per-128-bit-lane byte rotate shuffle; CHECKs assert it lowers to vpalignr $1 |
| ; with the reload folded. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> |
| ret <32 x i8> %2 |
| } |
| |
| define <32 x i8> @stack_fold_pand(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_pand: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 |
| ; CHECK-NEXT: retq |
| ; CHECKs above assert the spill reload is folded into vpand. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = and <32 x i8> %a0, %a1 |
| ; add forces execution domain |
| %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| ret <32 x i8> %3 |
| } |
| |
| define <32 x i8> @stack_fold_pandn(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_pandn: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpandn {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 |
| ; CHECK-NEXT: retq |
| ; (~a0 & a1) pattern; CHECKs assert it lowers to vpandn with the reload folded. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = xor <32 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> |
| %3 = and <32 x i8> %2, %a1 |
| ; add forces execution domain |
| %4 = add <32 x i8> %3, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| ret <32 x i8> %4 |
| } |
| |
| define <32 x i8> @stack_fold_pavgb(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_pavgb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; Widen/add/+1/shift/truncate rounding-average pattern; CHECKs assert it is |
| ; recognized as vpavgb with the reload folded. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = zext <32 x i8> %a0 to <32 x i16> |
| %3 = zext <32 x i8> %a1 to <32 x i16> |
| %4 = add <32 x i16> %2, %3 |
| %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> |
| %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> |
| %7 = trunc <32 x i16> %6 to <32 x i8> |
| ret <32 x i8> %7 |
| } |
| |
| define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_pavgw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; Widen/add/+1/shift/truncate rounding-average pattern; CHECKs assert it is |
| ; recognized as vpavgw with the reload folded. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = zext <16 x i16> %a0 to <16 x i32> |
| %3 = zext <16 x i16> %a1 to <16 x i32> |
| %4 = add <16 x i32> %2, %3 |
| %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> |
| %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> |
| %7 = trunc <16 x i32> %6 to <16 x i16> |
| ret <16 x i16> %7 |
| } |
| |
| define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_pblendd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # xmm0 = mem[0,1,2],xmm0[3] |
| ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 |
| ; CHECK-NEXT: retq |
| ; Blend shuffle; CHECKs assert it lowers to vpblendd $7 with the reload folded. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3> |
| ; add forces execution domain |
| %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1> |
| ret <4 x i32> %3 |
| } |
| |
| define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_pblendd_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: # ymm0 = mem[0,1,2],ymm0[3,4,5,6,7] |
| ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 |
| ; CHECK-NEXT: retq |
| ; Blend shuffle (ymm); CHECKs assert vpblendd $7 with the reload folded. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7> |
| ; add forces execution domain |
| %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> |
| ret <8 x i32> %3 |
| } |
| |
| define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) { |
| ; CHECK-LABEL: stack_fold_pblendvb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; %c (ymm2) is spilled; CHECKs assert its reload is folded into vpblendvb. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a1, <32 x i8> %c, <32 x i8> %a0) |
| ret <32 x i8> %2 |
| } |
| declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone |
| |
| define <16 x i16> @stack_fold_pblendw(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_pblendw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: # ymm0 = mem[0,1,2],ymm0[3,4,5,6,7],mem[8,9,10],ymm0[11,12,13,14,15] |
| ; CHECK-NEXT: retq |
| ; Per-lane word blend shuffle; CHECKs assert vpblendw $7 with the reload folded. |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15> |
| ret <16 x i16> %2 |
| } |
| |
| define <16 x i8> @stack_fold_pbroadcastb(<16 x i8> %a0) { |
| ; CHECK-LABEL: stack_fold_pbroadcastb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpbroadcastb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; Splat shuffle; CHECKs assert the reload is folded into vpbroadcastb (xmm). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer |
| ret <16 x i8> %2 |
| } |
| |
| define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) { |
| ; CHECK-LABEL: stack_fold_pbroadcastb_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpbroadcastb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; Splat shuffle; CHECKs assert the reload is folded into vpbroadcastb (ymm). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> zeroinitializer |
| ret <32 x i8> %2 |
| } |
| |
| define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) { |
| ; CHECK-LABEL: stack_fold_pbroadcastd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 |
| ; CHECK-NEXT: retq |
| ; Splat shuffle; CHECKs assert the reload is folded into vpbroadcastd (xmm). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> zeroinitializer |
| ; add forces execution domain |
| %3 = add <4 x i32> %2, <i32 2, i32 1, i32 1, i32 1> |
| ret <4 x i32> %3 |
| } |
| |
| define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) { |
| ; CHECK-LABEL: stack_fold_pbroadcastd_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 |
| ; CHECK-NEXT: retq |
| ; Splat shuffle; CHECKs assert the reload is folded into vpbroadcastd (ymm). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> zeroinitializer |
| ; add forces execution domain |
| %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> |
| ret <8 x i32> %3 |
| } |
| |
| define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) { |
| ; CHECK-LABEL: stack_fold_pbroadcastq: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 |
| ; CHECK-NEXT: retq |
| ; Splat shuffle; CHECKs assert the reload is folded into vpbroadcastq (xmm). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer |
| ; add forces execution domain |
| %3 = add <2 x i64> %2, <i64 2, i64 1> |
| ret <2 x i64> %3 |
| } |
| |
| define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) { |
| ; CHECK-LABEL: stack_fold_pbroadcastq_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 |
| ; CHECK-NEXT: retq |
| ; Splat shuffle; CHECKs assert the reload is folded into vpbroadcastq (ymm). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer |
| ; add forces execution domain |
| %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1> |
| ret <4 x i64> %3 |
| } |
| |
| define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) { |
| ; CHECK-LABEL: stack_fold_pbroadcastw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpbroadcastw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; Splat shuffle; CHECKs assert the reload is folded into vpbroadcastw (xmm). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer |
| ret <8 x i16> %2 |
| } |
| |
| define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) { |
| ; CHECK-LABEL: stack_fold_pbroadcastw_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpbroadcastw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| ; Splat shuffle; CHECKs assert the reload is folded into vpbroadcastw (ymm). |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> zeroinitializer |
| ret <16 x i16> %2 |
| } |
| |
; icmp eq + sext selects vpcmpeqb; the reload of spilled %a1 must fold into
; the compare's memory operand.
define <32 x i8> @stack_fold_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqb:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <32 x i8> %a0, %a1
  %3 = sext <32 x i1> %2 to <32 x i8>
  ret <32 x i8> %3
}
| |
; icmp eq + sext selects vpcmpeqd; the reload of spilled %a1 must fold into
; the compare's memory operand.
define <8 x i32> @stack_fold_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <8 x i32> %a0, %a1
  %3 = sext <8 x i1> %2 to <8 x i32>
  ret <8 x i32> %3
}
| |
; icmp eq + sext selects vpcmpeqq; the reload of spilled %a1 must fold into
; the compare's memory operand.
define <4 x i64> @stack_fold_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <4 x i64> %a0, %a1
  %3 = sext <4 x i1> %2 to <4 x i64>
  ret <4 x i64> %3
}
| |
; icmp eq + sext selects vpcmpeqw; the reload of spilled %a1 must fold into
; the compare's memory operand.
define <16 x i16> @stack_fold_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <16 x i16> %a0, %a1
  %3 = sext <16 x i1> %2 to <16 x i16>
  ret <16 x i16> %3
}
| |
; icmp sgt + sext selects vpcmpgtb; the reload of spilled %a1 must fold into
; the compare's memory operand.
define <32 x i8> @stack_fold_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtb:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <32 x i8> %a0, %a1
  %3 = sext <32 x i1> %2 to <32 x i8>
  ret <32 x i8> %3
}
| |
; icmp sgt + sext selects vpcmpgtd; the reload of spilled %a1 must fold into
; the compare's memory operand.
define <8 x i32> @stack_fold_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i32> %a0, %a1
  %3 = sext <8 x i1> %2 to <8 x i32>
  ret <8 x i32> %3
}
| |
; icmp sgt + sext selects vpcmpgtq; the reload of spilled %a1 must fold into
; the compare's memory operand.
define <4 x i64> @stack_fold_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <4 x i64> %a0, %a1
  %3 = sext <4 x i1> %2 to <4 x i64>
  ret <4 x i64> %3
}
| |
; icmp sgt + sext selects vpcmpgtw; the reload of spilled %a1 must fold into
; the compare's memory operand.
define <16 x i16> @stack_fold_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i16> %a0, %a1
  %3 = sext <16 x i1> %2 to <16 x i16>
  ret <16 x i16> %3
}
| |
; Cross-lane shuffle <4..11> selects vperm2i128 $33; the reload of spilled
; %a1 must fold into its memory operand.
define <8 x i32> @stack_fold_perm2i128(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_perm2i128:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vperm2i128 $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}
| |
; llvm.x86.avx2.permd with %a1 spilled; the reload must fold into vpermd's
; memory operand (note %a0 is the index vector, %a1 the data).
define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_permd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}
| declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly |
| |
; Shuffle <3,2,2,3> selects immediate-form vpermpd; the reload of spilled %a0
; must fold into its memory operand.
define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
; CHECK-LABEL: stack_fold_permpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = mem[3,2,2,3]
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}
| |
; llvm.x86.avx2.permps with %a1 spilled; the reload must fold into vpermps's
; memory operand (%a0 is the index vector, %a1 the data).
define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_permps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
  ret <8 x float> %2
}
| declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly |
| |
; Shuffle <3,2,2,3> selects immediate-form vpermq; the reload of spilled %a0
; must fold into its memory operand.
define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
; CHECK-LABEL: stack_fold_permq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: # ymm0 = mem[3,2,2,3]
; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}
| |
; llvm.x86.avx2.phadd.d with %a1 spilled; the reload must fold into vphaddd.
define <8 x i32> @stack_fold_phaddd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_phaddd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vphaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
| declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone |
| |
; llvm.x86.avx2.phadd.sw with %a1 spilled; the reload must fold into vphaddsw.
define <16 x i16> @stack_fold_phaddsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phaddsw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vphaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
| declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone |
| |
; llvm.x86.avx2.phadd.w with %a1 spilled; the reload must fold into vphaddw.
define <16 x i16> @stack_fold_phaddw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phaddw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vphaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
| declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone |
| |
; llvm.x86.avx2.phsub.d with %a1 spilled; the reload must fold into vphsubd.
define <8 x i32> @stack_fold_phsubd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_phsubd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vphsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
| declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone |
| |
; llvm.x86.avx2.phsub.sw with %a1 spilled; the reload must fold into vphsubsw.
define <16 x i16> @stack_fold_phsubsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phsubsw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vphsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
| declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone |
| |
; llvm.x86.avx2.phsub.w with %a1 spilled; the reload must fold into vphsubw.
define <16 x i16> @stack_fold_phsubw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phsubw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vphsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
| declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone |
| |
; llvm.x86.avx2.pmadd.ub.sw with %a1 spilled; the reload must fold into
; vpmaddubsw.
define <16 x i16> @stack_fold_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaddubsw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
  ret <16 x i16> %2
}
| declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone |
| |
; llvm.x86.avx2.pmadd.wd with %a1 spilled; the reload must fold into vpmaddwd.
define <8 x i32> @stack_fold_pmaddwd(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaddwd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
  ret <8 x i32> %2
}
| declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone |
| |
; icmp sgt + select is the signed-max pattern selecting vpmaxsb; the reload
; of spilled %a1 must fold into it.
define <32 x i8> @stack_fold_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxsb:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}
| |
; icmp sgt + select is the signed-max pattern selecting vpmaxsd; the reload
; of spilled %a1 must fold into it.
define <8 x i32> @stack_fold_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}
| |
; icmp sgt + select is the signed-max pattern selecting vpmaxsw; the reload
; of spilled %a1 must fold into it.
define <16 x i16> @stack_fold_pmaxsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxsw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}
| |
; icmp ugt + select is the unsigned-max pattern selecting vpmaxub; the reload
; of spilled %a1 must fold into it.
define <32 x i8> @stack_fold_pmaxub(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxub:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}
| |
; icmp ugt + select is the unsigned-max pattern selecting vpmaxud; the reload
; of spilled %a1 must fold into it.
define <8 x i32> @stack_fold_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxud:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}
| |
; icmp ugt + select is the unsigned-max pattern selecting vpmaxuw; the reload
; of spilled %a1 must fold into it.
define <16 x i16> @stack_fold_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxuw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}
| |
; icmp slt + select is the signed-min pattern selecting vpminsb; the reload
; of spilled %a1 must fold into it.
define <32 x i8> @stack_fold_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pminsb:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp slt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}
| |
; icmp slt + select is the signed-min pattern selecting vpminsd; the reload
; of spilled %a1 must fold into it.
define <8 x i32> @stack_fold_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp slt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}
| |
; icmp slt + select is the signed-min pattern selecting vpminsw; the reload
; of spilled %a1 must fold into it.
define <16 x i16> @stack_fold_pminsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pminsw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp slt <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}
| |
; icmp ult + select is the unsigned-min pattern selecting vpminub; the reload
; of spilled %a1 must fold into it.
define <32 x i8> @stack_fold_pminub(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pminub:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ult <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}
| |
; icmp ult + select is the unsigned-min pattern selecting vpminud; the reload
; of spilled %a1 must fold into it.
define <8 x i32> @stack_fold_pminud(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminud:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ult <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}
| |
; icmp ult + select is the unsigned-min pattern selecting vpminuw; the reload
; of spilled %a1 must fold into it.
define <16 x i16> @stack_fold_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pminuw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ult <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}
| |
; Low-half shuffle + sext i8->i32 selects vpmovsxbd; the reload of spilled
; %a0 must fold into it.
define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = sext <8 x i8> %2 to <8 x i32>
  ret <8 x i32> %3
}
| |
; Low-quarter shuffle + sext i8->i64 selects vpmovsxbq; the reload of spilled
; %a0 must fold into it.
define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = sext <4 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}
| |
; sext i8->i16 selects vpmovsxbw; the reload of spilled %a0 must fold into it.
define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sext <16 x i8> %a0 to <16 x i16>
  ret <16 x i16> %2
}
| |
; sext i32->i64 selects vpmovsxdq; the reload of spilled %a0 must fold into it.
define <4 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pmovsxdq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sext <4 x i32> %a0 to <4 x i64>
  ret <4 x i64> %2
}
| |
; sext i16->i32 selects vpmovsxwd; the reload of spilled %a0 must fold into it.
define <8 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovsxwd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sext <8 x i16> %a0 to <8 x i32>
  ret <8 x i32> %2
}
| |
; Low-half shuffle + sext i16->i64 selects vpmovsxwq; the reload of spilled
; %a0 must fold into it.
define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovsxwq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = sext <4 x i16> %2 to <4 x i64>
  ret <4 x i64> %3
}
| |
; Low-half shuffle + zext i8->i32 selects vpmovzxbd; the reload of spilled
; %a0 must fold into it.
define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = zext <8 x i8> %2 to <8 x i32>
  ret <8 x i32> %3
}
| |
; Low-quarter shuffle + zext i8->i64 selects vpmovzxbq; the reload of spilled
; %a0 must fold into it.
define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = zext <4 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}
| |
; zext i8->i16 selects vpmovzxbw; the reload of spilled %a0 must fold into it.
define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbw:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <16 x i8> %a0 to <16 x i16>
  ret <16 x i16> %2
}
| |
; zext i32->i64 selects vpmovzxdq; the reload of spilled %a0 must fold into it.
define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pmovzxdq:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <4 x i32> %a0 to <4 x i64>
  ret <4 x i64> %2
}
| |
| define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) { |
| ; CHECK-LABEL: stack_fold_pmovzxwd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = zext <8 x i16> %a0 to <8 x i32> |
| ret <8 x i32> %2 |
| } |
| |
| define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) { |
| ; CHECK-LABEL: stack_fold_pmovzxwq: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| %3 = zext <4 x i16> %2 to <4 x i64> |
| ret <4 x i64> %3 |
| } |
| |
| ; Signed 32x32->64 multiply: shl/ashr by 32 sign-extends the low half of each i64 lane, |
| ; which selects vpmuldq; the spilled %a1 must be reloaded by the multiply itself. |
| define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_pmuldq: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = bitcast <8 x i32> %a0 to <4 x i64> |
| %3 = bitcast <8 x i32> %a1 to <4 x i64> |
| %4 = shl <4 x i64> %2, <i64 32, i64 32, i64 32, i64 32> |
| %5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32> |
| %6 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32> |
| %7 = ashr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32> |
| %8 = mul <4 x i64> %5, %7 |
| ret <4 x i64> %8 |
| } |
|  |
| ; Rounded-scaled high multiply intrinsic; the reload must fold into vpmulhrsw. |
| define <16 x i16> @stack_fold_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_pmulhrsw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone |
|  |
| ; Unsigned high-half word multiply intrinsic; the reload must fold into vpmulhuw. |
| define <16 x i16> @stack_fold_pmulhuw(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_pmulhuw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone |
|  |
| ; Signed high-half word multiply intrinsic; the reload must fold into vpmulhw. |
| define <16 x i16> @stack_fold_pmulhw(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_pmulhw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpmulhw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone |
|  |
| ; Plain 32-bit vector multiply; the reload must fold into vpmulld. |
| define <8 x i32> @stack_fold_pmulld(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_pmulld: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = mul <8 x i32> %a0, %a1 |
| ret <8 x i32> %2 |
| } |
|  |
| ; Plain 16-bit vector multiply; the reload must fold into vpmullw. |
| define <16 x i16> @stack_fold_pmullw(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_pmullw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = mul <16 x i16> %a0, %a1 |
| ret <16 x i16> %2 |
| } |
|  |
| ; Unsigned 32x32->64 multiply: masking each i64 lane to its low 32 bits selects |
| ; vpmuludq; the spilled %a1 must be reloaded by the multiply itself. |
| define <4 x i64> @stack_fold_pmuludq(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_pmuludq: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = bitcast <8 x i32> %a0 to <4 x i64> |
| %3 = bitcast <8 x i32> %a1 to <4 x i64> |
| %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> |
| %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> |
| %6 = mul <4 x i64> %4, %5 |
| ret <4 x i64> %6 |
| } |
| |
| ; Bitwise OR with a folded reload; the trailing add pins the integer execution |
| ; domain so vpor (not vorps) is emitted. |
| define <32 x i8> @stack_fold_por(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_por: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpor {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = or <32 x i8> %a0, %a1 |
| ; add forces execution domain |
| %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> |
| ret <32 x i8> %3 |
| } |
|  |
| ; Sum-of-absolute-differences intrinsic; the reload must fold into vpsadbw. |
| define <4 x i64> @stack_fold_psadbw(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_psadbw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) |
| ret <4 x i64> %2 |
| } |
| declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone |
|  |
| ; Byte shuffle intrinsic; the spilled shuffle-mask operand is reloaded by vpshufb. |
| define <32 x i8> @stack_fold_pshufb(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_pshufb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) |
| ret <32 x i8> %2 |
| } |
| declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone |
| declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone |
| |
| ; Immediate dword shuffle (imm 27 reverses each 128-bit lane) with a folded reload; |
| ; the trailing add pins the integer domain so vpshufd (not vpermilps) is emitted. |
| define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) { |
| ; CHECK-LABEL: stack_fold_pshufd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4] |
| ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> |
| ; add forces execution domain |
| %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> |
| ret <8 x i32> %3 |
| } |
|  |
| ; High-word shuffle (reverses words 4-7 of each lane); reload folds into vpshufhw. |
| define <16 x i16> @stack_fold_vpshufhw(<16 x i16> %a0) { |
| ; CHECK-LABEL: stack_fold_vpshufhw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: # ymm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12> |
| ret <16 x i16> %2 |
| } |
|  |
| ; Low-word shuffle (reverses words 0-3 of each lane); reload folds into vpshuflw. |
| define <16 x i16> @stack_fold_vpshuflw(<16 x i16> %a0) { |
| ; CHECK-LABEL: stack_fold_vpshuflw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: # ymm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> |
| ret <16 x i16> %2 |
| } |
| |
| ; Byte sign-copy intrinsic; the spilled sign operand is reloaded by vpsignb. |
| define <32 x i8> @stack_fold_psignb(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_psignb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsignb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) |
| ret <32 x i8> %2 |
| } |
| declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone |
|  |
| ; Dword sign-copy intrinsic; the spilled sign operand is reloaded by vpsignd. |
| define <8 x i32> @stack_fold_psignd(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_psignd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsignd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) |
| ret <8 x i32> %2 |
| } |
| declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone |
|  |
| ; Word sign-copy intrinsic; the spilled sign operand is reloaded by vpsignw. |
| define <16 x i16> @stack_fold_psignw(<16 x i16> %a0, <16 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_psignw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsignw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone |
| |
| ; Shift-left tests: the spilled shift-count (xmm spill for uniform-count forms, |
| ; ymm spill for the per-element vpsllv* forms) must be reloaded by the shift itself. |
| define <8 x i32> @stack_fold_pslld(<8 x i32> %a0, <4 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_pslld: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) |
| ret <8 x i32> %2 |
| } |
| declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone |
|  |
| ; Quadword shift left; count reloaded from the 16-byte spill slot. |
| define <4 x i64> @stack_fold_psllq(<4 x i64> %a0, <2 x i64> %a1) { |
| ; CHECK-LABEL: stack_fold_psllq: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) |
| ret <4 x i64> %2 |
| } |
| declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone |
|  |
| ; Per-element variable shift left (128-bit form); counts reloaded by vpsllvd. |
| define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_psllvd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) |
| ret <4 x i32> %2 |
| } |
| declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone |
|  |
| ; Per-element variable shift left (256-bit form); counts reloaded by vpsllvd. |
| define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_psllvd_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) |
| ret <8 x i32> %2 |
| } |
| declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone |
|  |
| ; Per-element variable quadword shift left (128-bit form); folds into vpsllvq. |
| define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) { |
| ; CHECK-LABEL: stack_fold_psllvq: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) |
| ret <2 x i64> %2 |
| } |
| declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone |
|  |
| ; Per-element variable quadword shift left (256-bit form); folds into vpsllvq. |
| define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { |
| ; CHECK-LABEL: stack_fold_psllvq_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) |
| ret <4 x i64> %2 |
| } |
| declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone |
|  |
| ; Word shift left; count reloaded from the 16-byte spill slot. |
| define <16 x i16> @stack_fold_psllw(<16 x i16> %a0, <8 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_psllw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone |
| |
| ; Arithmetic shift-right tests; the spilled count operand must be reloaded by the shift. |
| define <8 x i32> @stack_fold_psrad(<8 x i32> %a0, <4 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_psrad: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) |
| ret <8 x i32> %2 |
| } |
| declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone |
|  |
| ; Per-element variable arithmetic shift right (128-bit form); folds into vpsravd. |
| define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_psravd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) |
| ret <4 x i32> %2 |
| } |
| declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone |
|  |
| ; Per-element variable arithmetic shift right (256-bit form); folds into vpsravd. |
| define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_psravd_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) |
| ret <8 x i32> %2 |
| } |
| declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone |
|  |
| ; Word arithmetic shift right; count reloaded from the 16-byte spill slot. |
| define <16 x i16> @stack_fold_psraw(<16 x i16> %a0, <8 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_psraw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone |
| |
| ; Logical shift-right tests; the spilled count operand must be reloaded by the shift. |
| define <8 x i32> @stack_fold_psrld(<8 x i32> %a0, <4 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_psrld: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) |
| ret <8 x i32> %2 |
| } |
| declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone |
|  |
| ; Quadword logical shift right; count reloaded from the 16-byte spill slot. |
| define <4 x i64> @stack_fold_psrlq(<4 x i64> %a0, <2 x i64> %a1) { |
| ; CHECK-LABEL: stack_fold_psrlq: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) |
| ret <4 x i64> %2 |
| } |
| declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone |
|  |
| ; Per-element variable logical shift right (128-bit form); folds into vpsrlvd. |
| define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_psrlvd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) |
| ret <4 x i32> %2 |
| } |
| declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone |
|  |
| ; Per-element variable logical shift right (256-bit form); folds into vpsrlvd. |
| define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_psrlvd_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) |
| ret <8 x i32> %2 |
| } |
| declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone |
|  |
| ; Per-element variable quadword logical shift right (128-bit form); folds into vpsrlvq. |
| define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) { |
| ; CHECK-LABEL: stack_fold_psrlvq: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) |
| ret <2 x i64> %2 |
| } |
| declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone |
|  |
| ; Per-element variable quadword logical shift right (256-bit form); folds into vpsrlvq. |
| define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { |
| ; CHECK-LABEL: stack_fold_psrlvq_ymm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) |
| ret <4 x i64> %2 |
| } |
| declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone |
|  |
| ; Word logical shift right; count reloaded from the 16-byte spill slot. |
| define <16 x i16> @stack_fold_psrlw(<16 x i16> %a0, <8 x i16> %a1) { |
| ; CHECK-LABEL: stack_fold_psrlw: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) |
| ret <16 x i16> %2 |
| } |
| declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone |
| |
| define <32 x i8> @stack_fold_psubb(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_psubb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = sub <32 x i8> %a0, %a1 |
| ret <32 x i8> %2 |
| } |
| |
| define <8 x i32> @stack_fold_psubd(<8 x i32> %a0, <8 x i32> %a1) { |
| ; CHECK-LABEL: stack_fold_psubd: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = sub <8 x i32> %a0, %a1 |
| ret <8 x i32> %2 |
| } |
| |
| define <4 x i64> @stack_fold_psubq(<4 x i64> %a0, <4 x i64> %a1) { |
| ; CHECK-LABEL: stack_fold_psubq: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = sub <4 x i64> %a0, %a1 |
| ret <4 x i64> %2 |
| } |
| |
| define <32 x i8> @stack_fold_psubsb(<32 x i8> %a0, <32 x i8> %a1) { |
| ; CHECK-LABEL: stack_fold_psubsb: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() |
| %2 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) |
| ret <32 x i8> %2 |
| } |
| declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone |
| |
| ; Signed saturating word subtract via llvm.ssub.sat: verify the spilled %a1
| ; reloads as the folded memory operand of vpsubsw.
| define <16 x i16> @stack_fold_psubsw(<16 x i16> %a0, <16 x i16> %a1) {
| ; CHECK-LABEL: stack_fold_psubsw:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
| ret <16 x i16> %2
| }
| declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
| |
| ; Unsigned saturating subtract via llvm.usub.sat: verify the spilled %a1
| ; reloads as the folded memory operand of vpsubusb.
| define <32 x i8> @stack_fold_psubusb(<32 x i8> %a0, <32 x i8> %a1) {
| ; CHECK-LABEL: stack_fold_psubusb:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
| ret <32 x i8> %2
| }
| declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
| |
| ; Unsigned saturating word subtract via llvm.usub.sat: verify the spilled %a1
| ; reloads as the folded memory operand of vpsubusw.
| define <16 x i16> @stack_fold_psubusw(<16 x i16> %a0, <16 x i16> %a1) {
| ; CHECK-LABEL: stack_fold_psubusw:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
| ret <16 x i16> %2
| }
| declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
| |
| ; Plain word subtract: verify the spilled %a1 reloads as the folded memory
| ; operand of vpsubw.
| define <16 x i16> @stack_fold_psubw(<16 x i16> %a0, <16 x i16> %a1) {
| ; CHECK-LABEL: stack_fold_psubw:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = sub <16 x i16> %a0, %a1
| ret <16 x i16> %2
| }
| |
| ; High byte interleave (per 128-bit lane) expressed as a shufflevector: verify
| ; the spilled %a1 reloads as the folded memory operand of vpunpckhbw.
| define <32 x i8> @stack_fold_punpckhbw(<32 x i8> %a0, <32 x i8> %a1) {
| ; CHECK-LABEL: stack_fold_punpckhbw:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
| ret <32 x i8> %2
| }
| |
| ; High dword interleave as a shufflevector: verify the spilled %a1 reloads as
| ; the folded memory operand of vpunpckhdq; the trailing add pins the integer
| ; execution domain so the shuffle is not emitted as vunpckhps.
| define <8 x i32> @stack_fold_punpckhdq(<8 x i32> %a0, <8 x i32> %a1) {
| ; CHECK-LABEL: stack_fold_punpckhdq:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
| ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
| ; add forces execution domain
| %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
| ret <8 x i32> %3
| }
| |
| ; High qword interleave as a shufflevector: verify the spilled %a1 reloads as
| ; the folded memory operand of vpunpckhqdq; the trailing add pins the integer
| ; execution domain.
| define <4 x i64> @stack_fold_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1) {
| ; CHECK-LABEL: stack_fold_punpckhqdq:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
| ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
| ; add forces execution domain
| %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
| ret <4 x i64> %3
| }
| |
| ; High word interleave as a shufflevector: verify the spilled %a1 reloads as
| ; the folded memory operand of vpunpckhwd.
| define <16 x i16> @stack_fold_punpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
| ; CHECK-LABEL: stack_fold_punpckhwd:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
| ret <16 x i16> %2
| }
| |
| ; Low byte interleave (per 128-bit lane) as a shufflevector: verify the spilled
| ; %a1 reloads as the folded memory operand of vpunpcklbw.
| define <32 x i8> @stack_fold_punpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
| ; CHECK-LABEL: stack_fold_punpcklbw:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23]
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
| ret <32 x i8> %2
| }
| |
| ; Low dword interleave as a shufflevector: verify the spilled %a1 reloads as
| ; the folded memory operand of vpunpckldq; the trailing add pins the integer
| ; execution domain so the shuffle is not emitted as vunpcklps.
| define <8 x i32> @stack_fold_punpckldq(<8 x i32> %a0, <8 x i32> %a1) {
| ; CHECK-LABEL: stack_fold_punpckldq:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
| ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
| ; add forces execution domain
| %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
| ret <8 x i32> %3
| }
| |
| ; Low qword interleave as a shufflevector: verify the spilled %a1 reloads as
| ; the folded memory operand of vpunpcklqdq; the trailing add pins the integer
| ; execution domain.
| define <4 x i64> @stack_fold_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1) {
| ; CHECK-LABEL: stack_fold_punpcklqdq:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
| ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
| ; add forces execution domain
| %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
| ret <4 x i64> %3
| }
| |
| ; Low word interleave as a shufflevector: verify the spilled %a1 reloads as
| ; the folded memory operand of vpunpcklwd.
| define <16 x i16> @stack_fold_punpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
| ; CHECK-LABEL: stack_fold_punpcklwd:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
| ret <16 x i16> %2
| }
| |
| ; Bitwise xor: verify the spilled %a1 reloads as the folded memory operand of
| ; vpxor; the trailing add pins the integer execution domain so the xor is not
| ; emitted as vxorps.
| define <32 x i8> @stack_fold_pxor(<32 x i8> %a0, <32 x i8> %a1) {
| ; CHECK-LABEL: stack_fold_pxor:
| ; CHECK: # %bb.0:
| ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
| ; CHECK-NEXT: #APP
| ; CHECK-NEXT: nop
| ; CHECK-NEXT: #NO_APP
| ; CHECK-NEXT: vpxor {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
| ; CHECK-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
| ; CHECK-NEXT: retq
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
| %2 = xor <32 x i8> %a0, %a1
| ; add forces execution domain
| %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
| ret <32 x i8> %3
| }