| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefixes=SSE,SSE-LINUX |
| ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefixes=SSE,SSE-WIN |
| ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefixes=AVX,AVX1 |
| ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512vl -mcpu=skx | FileCheck %s --check-prefixes=AVX,AVX512VL |
| |
| ; Loads a float from %x and extends it to double. The function is not marked |
| ; optsize, and the check lines for every run expect a separate scalar load |
| ; followed by a register-to-register convert instead of a convert that takes |
| ; a memory operand. |
| ; Presumably the unfolded form is wanted because the load writes the whole |
| ; register and so removes the convert's dependency on the old destination |
| ; value - confirm against the X86 load-folding logic. |
| define dso_local double @t1(float* nocapture %x) nounwind readonly ssp { |
| ; SSE-LABEL: t1: |
| ; SSE: # %bb.0: # %entry |
| ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero |
| ; SSE-NEXT: cvtss2sd %xmm0, %xmm0 |
| ; SSE-NEXT: retq |
| ; |
| ; AVX-LABEL: t1: |
| ; AVX: # %bb.0: # %entry |
| ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero |
| ; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| entry: |
| |
| %0 = load float, float* %x, align 4 |
| %1 = fpext float %0 to double |
| ret double %1 |
| } |
| |
| ; Loads a double from %x and truncates it to float. The function carries the |
| ; optsize attribute, and the check lines expect the load to be folded into |
| ; the convert's memory operand (pointer in rdi for the Linux run, rcx for |
| ; the Windows runs). |
| define dso_local float @t2(double* nocapture %x) nounwind readonly ssp optsize { |
| ; SSE-LINUX-LABEL: t2: |
| ; SSE-LINUX: # %bb.0: # %entry |
| ; SSE-LINUX-NEXT: cvtsd2ss (%rdi), %xmm0 |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: t2: |
| ; SSE-WIN: # %bb.0: # %entry |
| ; SSE-WIN-NEXT: cvtsd2ss (%rcx), %xmm0 |
| ; SSE-WIN-NEXT: retq |
| ; |
| ; AVX-LABEL: t2: |
| ; AVX: # %bb.0: # %entry |
| ; AVX-NEXT: vcvtsd2ss (%rcx), %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| entry: |
| %0 = load double, double* %x, align 8 |
| %1 = fptrunc double %0 to float |
| ret float %1 |
| } |
| |
| ; Single-precision square root of a value loaded from %x, without optsize. |
| ; The check lines expect the load to stay a separate instruction and the |
| ; square root to use the register form. Compare with @squirtf_size, which has |
| ; the same body plus optsize. |
| define dso_local float @squirtf(float* %x) nounwind { |
| ; SSE-LABEL: squirtf: |
| ; SSE: # %bb.0: # %entry |
| ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero |
| ; SSE-NEXT: sqrtss %xmm0, %xmm0 |
| ; SSE-NEXT: retq |
| ; |
| ; AVX-LABEL: squirtf: |
| ; AVX: # %bb.0: # %entry |
| ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero |
| ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| entry: |
| %z = load float, float* %x |
| %t = call float @llvm.sqrt.f32(float %z) |
| ret float %t |
| } |
| |
| ; Double-precision square root of a value loaded from %x, without optsize. |
| ; The check lines expect the load to stay a separate instruction and the |
| ; square root to use the register form. Compare with @squirt_size, which has |
| ; the same body plus optsize. |
| define dso_local double @squirt(double* %x) nounwind { |
| ; SSE-LABEL: squirt: |
| ; SSE: # %bb.0: # %entry |
| ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero |
| ; SSE-NEXT: sqrtsd %xmm0, %xmm0 |
| ; SSE-NEXT: retq |
| ; |
| ; AVX-LABEL: squirt: |
| ; AVX: # %bb.0: # %entry |
| ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero |
| ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| entry: |
| %z = load double, double* %x |
| %t = call double @llvm.sqrt.f64(double %z) |
| ret double %t |
| } |
| |
| ; Same body as @squirtf but with the optsize attribute. Here the check lines |
| ; expect the load to be folded into the square root's memory operand |
| ; (pointer in rdi for the Linux run, rcx for the Windows runs). |
| define dso_local float @squirtf_size(float* %x) nounwind optsize { |
| ; SSE-LINUX-LABEL: squirtf_size: |
| ; SSE-LINUX: # %bb.0: # %entry |
| ; SSE-LINUX-NEXT: sqrtss (%rdi), %xmm0 |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: squirtf_size: |
| ; SSE-WIN: # %bb.0: # %entry |
| ; SSE-WIN-NEXT: sqrtss (%rcx), %xmm0 |
| ; SSE-WIN-NEXT: retq |
| ; |
| ; AVX-LABEL: squirtf_size: |
| ; AVX: # %bb.0: # %entry |
| ; AVX-NEXT: vsqrtss (%rcx), %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| entry: |
| %z = load float, float* %x |
| %t = call float @llvm.sqrt.f32(float %z) |
| ret float %t |
| } |
| |
| ; Same body as @squirt but with the optsize attribute. Here the check lines |
| ; expect the load to be folded into the square root's memory operand |
| ; (pointer in rdi for the Linux run, rcx for the Windows runs). |
| define dso_local double @squirt_size(double* %x) nounwind optsize { |
| ; SSE-LINUX-LABEL: squirt_size: |
| ; SSE-LINUX: # %bb.0: # %entry |
| ; SSE-LINUX-NEXT: sqrtsd (%rdi), %xmm0 |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: squirt_size: |
| ; SSE-WIN: # %bb.0: # %entry |
| ; SSE-WIN-NEXT: sqrtsd (%rcx), %xmm0 |
| ; SSE-WIN-NEXT: retq |
| ; |
| ; AVX-LABEL: squirt_size: |
| ; AVX: # %bb.0: # %entry |
| ; AVX-NEXT: vsqrtsd (%rcx), %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| entry: |
| %z = load double, double* %x |
| %t = call double @llvm.sqrt.f64(double %z) |
| ret double %t |
| } |
| |
| ; Square-root intrinsics called by the squirt* functions in this file. |
| declare float @llvm.sqrt.f32(float) |
| declare double @llvm.sqrt.f64(double) |
| |
| ; This loop contains two cvtsi2ss instructions that update the same xmm |
| ; register. Verify that the break false dependency fix pass breaks those |
| ; dependencies by inserting xorps instructions. |
| ; |
| ; Shape of the expected output, as recorded in the check lines below: |
| ;  - both SSE2 runs place an xorps of the destination register directly |
| ;    before each cvtsi2ss inside the loop; |
| ;  - the AVX1 and AVX512VL runs have no vxorps inside the loop; their |
| ;    vcvtsi2ss instructions take as pass-through operand a register |
| ;    (xmm4 resp. xmm3) that nothing else in the loop writes. |
| ; The four configurations only differ in register choice and, for AVX512VL, |
| ; in the interleaving of the converts and adds. |
| define dso_local float @loopdep1(i32 %m) nounwind uwtable readnone ssp { |
| ; SSE-LINUX-LABEL: loopdep1: |
| ; SSE-LINUX: # %bb.0: # %entry |
| ; SSE-LINUX-NEXT: testl %edi, %edi |
| ; SSE-LINUX-NEXT: je .LBB6_1 |
| ; SSE-LINUX-NEXT: # %bb.2: # %for.body.preheader |
| ; SSE-LINUX-NEXT: movl $1, %eax |
| ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-LINUX-NEXT: xorps %xmm1, %xmm1 |
| ; SSE-LINUX-NEXT: .p2align 4, 0x90 |
| ; SSE-LINUX-NEXT: .LBB6_3: # %for.body |
| ; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; SSE-LINUX-NEXT: xorps %xmm2, %xmm2 |
| ; SSE-LINUX-NEXT: cvtsi2ss %eax, %xmm2 |
| ; SSE-LINUX-NEXT: xorps %xmm3, %xmm3 |
| ; SSE-LINUX-NEXT: cvtsi2ss %edi, %xmm3 |
| ; SSE-LINUX-NEXT: addss %xmm2, %xmm0 |
| ; SSE-LINUX-NEXT: addss %xmm3, %xmm1 |
| ; SSE-LINUX-NEXT: incl %eax |
| ; SSE-LINUX-NEXT: decl %edi |
| ; SSE-LINUX-NEXT: jne .LBB6_3 |
| ; SSE-LINUX-NEXT: # %bb.4: # %for.end |
| ; SSE-LINUX-NEXT: subss %xmm1, %xmm0 |
| ; SSE-LINUX-NEXT: retq |
| ; SSE-LINUX-NEXT: .LBB6_1: |
| ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-LINUX-NEXT: xorps %xmm1, %xmm1 |
| ; SSE-LINUX-NEXT: subss %xmm1, %xmm0 |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: loopdep1: |
| ; SSE-WIN: # %bb.0: # %entry |
| ; SSE-WIN-NEXT: testl %ecx, %ecx |
| ; SSE-WIN-NEXT: je .LBB6_1 |
| ; SSE-WIN-NEXT: # %bb.2: # %for.body.preheader |
| ; SSE-WIN-NEXT: movl $1, %eax |
| ; SSE-WIN-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-WIN-NEXT: xorps %xmm1, %xmm1 |
| ; SSE-WIN-NEXT: .p2align 4, 0x90 |
| ; SSE-WIN-NEXT: .LBB6_3: # %for.body |
| ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; SSE-WIN-NEXT: xorps %xmm2, %xmm2 |
| ; SSE-WIN-NEXT: cvtsi2ss %eax, %xmm2 |
| ; SSE-WIN-NEXT: xorps %xmm3, %xmm3 |
| ; SSE-WIN-NEXT: cvtsi2ss %ecx, %xmm3 |
| ; SSE-WIN-NEXT: addss %xmm2, %xmm0 |
| ; SSE-WIN-NEXT: addss %xmm3, %xmm1 |
| ; SSE-WIN-NEXT: incl %eax |
| ; SSE-WIN-NEXT: decl %ecx |
| ; SSE-WIN-NEXT: jne .LBB6_3 |
| ; SSE-WIN-NEXT: # %bb.4: # %for.end |
| ; SSE-WIN-NEXT: subss %xmm1, %xmm0 |
| ; SSE-WIN-NEXT: retq |
| ; SSE-WIN-NEXT: .LBB6_1: |
| ; SSE-WIN-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-WIN-NEXT: xorps %xmm1, %xmm1 |
| ; SSE-WIN-NEXT: subss %xmm1, %xmm0 |
| ; SSE-WIN-NEXT: retq |
| ; |
| ; AVX1-LABEL: loopdep1: |
| ; AVX1: # %bb.0: # %entry |
| ; AVX1-NEXT: testl %ecx, %ecx |
| ; AVX1-NEXT: je .LBB6_1 |
| ; AVX1-NEXT: # %bb.2: # %for.body.preheader |
| ; AVX1-NEXT: movl $1, %eax |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: .p2align 4, 0x90 |
| ; AVX1-NEXT: .LBB6_3: # %for.body |
| ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; AVX1-NEXT: vcvtsi2ss %eax, %xmm4, %xmm2 |
| ; AVX1-NEXT: vcvtsi2ss %ecx, %xmm4, %xmm3 |
| ; AVX1-NEXT: vaddss %xmm2, %xmm0, %xmm0 |
| ; AVX1-NEXT: vaddss %xmm3, %xmm1, %xmm1 |
| ; AVX1-NEXT: incl %eax |
| ; AVX1-NEXT: decl %ecx |
| ; AVX1-NEXT: jne .LBB6_3 |
| ; AVX1-NEXT: # %bb.4: # %for.end |
| ; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: retq |
| ; AVX1-NEXT: .LBB6_1: |
| ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 |
| ; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0 |
| ; AVX1-NEXT: retq |
| ; |
| ; AVX512VL-LABEL: loopdep1: |
| ; AVX512VL: # %bb.0: # %entry |
| ; AVX512VL-NEXT: testl %ecx, %ecx |
| ; AVX512VL-NEXT: je .LBB6_1 |
| ; AVX512VL-NEXT: # %bb.2: # %for.body.preheader |
| ; AVX512VL-NEXT: movl $1, %eax |
| ; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 |
| ; AVX512VL-NEXT: .p2align 4, 0x90 |
| ; AVX512VL-NEXT: .LBB6_3: # %for.body |
| ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; AVX512VL-NEXT: vcvtsi2ss %eax, %xmm3, %xmm2 |
| ; AVX512VL-NEXT: vaddss %xmm2, %xmm0, %xmm0 |
| ; AVX512VL-NEXT: vcvtsi2ss %ecx, %xmm3, %xmm2 |
| ; AVX512VL-NEXT: vaddss %xmm2, %xmm1, %xmm1 |
| ; AVX512VL-NEXT: incl %eax |
| ; AVX512VL-NEXT: decl %ecx |
| ; AVX512VL-NEXT: jne .LBB6_3 |
| ; AVX512VL-NEXT: # %bb.4: # %for.end |
| ; AVX512VL-NEXT: vsubss %xmm1, %xmm0, %xmm0 |
| ; AVX512VL-NEXT: retq |
| ; AVX512VL-NEXT: .LBB6_1: |
| ; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 |
| ; AVX512VL-NEXT: vsubss %xmm1, %xmm0, %xmm0 |
| ; AVX512VL-NEXT: retq |
| entry: |
| ; Skip the loop entirely when %m is zero; both sums are then 0.0. |
| %tobool3 = icmp eq i32 %m, 0 |
| br i1 %tobool3, label %for.end, label %for.body |
| |
| for.body: ; preds = %entry, %for.body |
| %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ] |
| %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ] |
| %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ] |
| %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ] |
| ; Two independent int-to-float converts per iteration: the up-counter %n.04 |
| ; feeds the first running sum, the down-counter %m.addr.07 feeds the second. |
| %conv = sitofp i32 %n.04 to float |
| %add = fadd float %s1.06, %conv |
| %conv1 = sitofp i32 %m.addr.07 to float |
| %add2 = fadd float %s2.05, %conv1 |
| %inc = add nsw i32 %n.04, 1 |
| %dec = add nsw i32 %m.addr.07, -1 |
| %tobool = icmp eq i32 %dec, 0 |
| br i1 %tobool, label %for.end, label %for.body |
| |
| for.end: ; preds = %for.body, %entry |
| %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] |
| %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ] |
| ; The result is the difference of the two running sums. |
| %sub = fsub float %s1.0.lcssa, %s2.0.lcssa |
| ret float %sub |
| } |
| |
| ; rdar:15221834 False AVX register dependencies cause 5x slowdown on |
| ; flops-6. Make sure the unused register read by vcvtsi2sd is zeroed |
| ; to avoid cyclic dependence on a write to the same register in a |
| ; previous iteration. |
| |
| ; The loop converts the i64 induction variable to double, then executes an |
| ; empty inline asm that lists xmm0-xmm31 as clobbered, then adds the value |
| ; loaded from %y, converts back to i64 and accumulates into a sum that starts |
| ; at the value loaded from %x. |
| ; Shape of the expected output, as recorded in the check lines below: |
| ;  - every run has an xorps/vxorps directly before the convert inside the |
| ;    loop (for the AVX runs it zeroes xmm1, which is the convert's |
| ;    pass-through operand); |
| ;  - the converted value is spilled to the stack before the asm and reloaded |
| ;    after it; |
| ;  - the Windows runs additionally save and restore xmm6-xmm15 around the |
| ;    function body, the Linux run does not. |
| define i64 @loopdep2(i64* nocapture %x, double* nocapture %y) nounwind { |
| ; SSE-LINUX-LABEL: loopdep2: |
| ; SSE-LINUX: # %bb.0: # %entry |
| ; SSE-LINUX-NEXT: movq (%rdi), %rax |
| ; SSE-LINUX-NEXT: movl $1, %ecx |
| ; SSE-LINUX-NEXT: .p2align 4, 0x90 |
| ; SSE-LINUX-NEXT: .LBB7_1: # %loop |
| ; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-LINUX-NEXT: cvtsi2sd %rcx, %xmm0 |
| ; SSE-LINUX-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload |
| ; SSE-LINUX-NEXT: # xmm0 = mem[0],zero |
| ; SSE-LINUX-NEXT: addsd (%rsi), %xmm0 |
| ; SSE-LINUX-NEXT: cvttsd2si %xmm0, %rdx |
| ; SSE-LINUX-NEXT: addq %rdx, %rax |
| ; SSE-LINUX-NEXT: incq %rcx |
| ; SSE-LINUX-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90 |
| ; SSE-LINUX-NEXT: jne .LBB7_1 |
| ; SSE-LINUX-NEXT: # %bb.2: # %ret |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: loopdep2: |
| ; SSE-WIN: # %bb.0: # %entry |
| ; SSE-WIN-NEXT: subq $184, %rsp |
| ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movq (%rcx), %rax |
| ; SSE-WIN-NEXT: movl $1, %r8d |
| ; SSE-WIN-NEXT: .p2align 4, 0x90 |
| ; SSE-WIN-NEXT: .LBB7_1: # %loop |
| ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; SSE-WIN-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-WIN-NEXT: cvtsi2sd %r8, %xmm0 |
| ; SSE-WIN-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload |
| ; SSE-WIN-NEXT: # xmm0 = mem[0],zero |
| ; SSE-WIN-NEXT: addsd (%rdx), %xmm0 |
| ; SSE-WIN-NEXT: cvttsd2si %xmm0, %rcx |
| ; SSE-WIN-NEXT: addq %rcx, %rax |
| ; SSE-WIN-NEXT: incq %r8 |
| ; SSE-WIN-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 |
| ; SSE-WIN-NEXT: jne .LBB7_1 |
| ; SSE-WIN-NEXT: # %bb.2: # %ret |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; SSE-WIN-NEXT: addq $184, %rsp |
| ; SSE-WIN-NEXT: retq |
| ; |
| ; AVX-LABEL: loopdep2: |
| ; AVX: # %bb.0: # %entry |
| ; AVX-NEXT: subq $184, %rsp |
| ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: movq (%rcx), %rax |
| ; AVX-NEXT: movl $1, %r8d |
| ; AVX-NEXT: .p2align 4, 0x90 |
| ; AVX-NEXT: .LBB7_1: # %loop |
| ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 |
| ; AVX-NEXT: vcvtsi2sd %r8, %xmm1, %xmm0 |
| ; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload |
| ; AVX-NEXT: # xmm0 = mem[0],zero |
| ; AVX-NEXT: vaddsd (%rdx), %xmm0, %xmm0 |
| ; AVX-NEXT: vcvttsd2si %xmm0, %rcx |
| ; AVX-NEXT: addq %rcx, %rax |
| ; AVX-NEXT: incq %r8 |
| ; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 |
| ; AVX-NEXT: jne .LBB7_1 |
| ; AVX-NEXT: # %bb.2: # %ret |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; AVX-NEXT: addq $184, %rsp |
| ; AVX-NEXT: retq |
| entry: |
| %vx = load i64, i64* %x |
| br label %loop |
| loop: |
| %i = phi i64 [ 1, %entry ], [ %inc, %loop ] |
| %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ] |
| %fi = sitofp i64 %i to double |
| ; Empty asm whose constraint string marks xmm0-xmm31 as clobbered, so %fi |
| ; cannot stay in a vector register across it. |
| tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() |
| %vy = load double, double* %y |
| %fipy = fadd double %fi, %vy |
| %iipy = fptosi double %fipy to i64 |
| %s2 = add i64 %s1, %iipy |
| %inc = add nsw i64 %i, 1 |
| %exitcond = icmp eq i64 %inc, 156250000 |
| br i1 %exitcond, label %ret, label %loop |
| ret: |
| ret i64 %s2 |
| } |
| |
| ; This loop contains a cvtsi2sd instruction that has a loop-carried |
| ; false dependency on an xmm that is modified by other scalar instructions |
| ; that follow it in the loop. Additionally, the source of the convert is a |
| ; memory operand. Verify that the break false dependency fix pass breaks |
| ; this dependency by inserting an xor before the convert. |
| ; Arrays accessed by @loopdep3: @v holds the i32 inputs to the convert, @x, @y |
| ; and @z hold the double multiplicands, and @w receives the products. |
| @x = common dso_local global [1024 x double] zeroinitializer, align 16 |
| @y = common dso_local global [1024 x double] zeroinitializer, align 16 |
| @z = common dso_local global [1024 x double] zeroinitializer, align 16 |
| @w = common dso_local global [1024 x double] zeroinitializer, align 16 |
| @v = common dso_local global [1024 x i32] zeroinitializer, align 16 |
| |
| ; Two nested loops: the outer one runs 100000 times, the inner one runs over |
| ; the 1024 array elements and stores (double)v[i] * x[i] * y[i] * z[i] to |
| ; w[i]. Each inner iteration ends with an empty inline asm that lists |
| ; xmm0-xmm31 as clobbered. |
| ; Shape of the expected output, as recorded in the check lines below: |
| ;  - every run has an xorps/vxorps of xmm0 directly before the convert, and |
| ;    the convert reads its integer source straight from memory; |
| ;  - the Windows runs additionally save and restore rsi and xmm6-xmm15 and |
| ;    emit the matching .seh_* unwind directives; the Linux run has no |
| ;    prologue or epilogue. |
| define dso_local void @loopdep3() { |
| ; SSE-LINUX-LABEL: loopdep3: |
| ; SSE-LINUX: # %bb.0: # %entry |
| ; SSE-LINUX-NEXT: xorl %eax, %eax |
| ; SSE-LINUX-NEXT: .p2align 4, 0x90 |
| ; SSE-LINUX-NEXT: .LBB8_1: # %for.cond1.preheader |
| ; SSE-LINUX-NEXT: # =>This Loop Header: Depth=1 |
| ; SSE-LINUX-NEXT: # Child Loop BB8_2 Depth 2 |
| ; SSE-LINUX-NEXT: movq $-4096, %rcx # imm = 0xF000 |
| ; SSE-LINUX-NEXT: .p2align 4, 0x90 |
| ; SSE-LINUX-NEXT: .LBB8_2: # %for.body3 |
| ; SSE-LINUX-NEXT: # Parent Loop BB8_1 Depth=1 |
| ; SSE-LINUX-NEXT: # => This Inner Loop Header: Depth=2 |
| ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-LINUX-NEXT: cvtsi2sdl v+4096(%rcx), %xmm0 |
| ; SSE-LINUX-NEXT: mulsd x+8192(%rcx,%rcx), %xmm0 |
| ; SSE-LINUX-NEXT: mulsd y+8192(%rcx,%rcx), %xmm0 |
| ; SSE-LINUX-NEXT: mulsd z+8192(%rcx,%rcx), %xmm0 |
| ; SSE-LINUX-NEXT: movsd %xmm0, w+8192(%rcx,%rcx) |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: addq $4, %rcx |
| ; SSE-LINUX-NEXT: jne .LBB8_2 |
| ; SSE-LINUX-NEXT: # %bb.3: # %for.inc14 |
| ; SSE-LINUX-NEXT: # in Loop: Header=BB8_1 Depth=1 |
| ; SSE-LINUX-NEXT: incl %eax |
| ; SSE-LINUX-NEXT: cmpl $100000, %eax # imm = 0x186A0 |
| ; SSE-LINUX-NEXT: jne .LBB8_1 |
| ; SSE-LINUX-NEXT: # %bb.4: # %for.end16 |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: loopdep3: |
| ; SSE-WIN: # %bb.0: # %entry |
| ; SSE-WIN-NEXT: pushq %rsi |
| ; SSE-WIN-NEXT: .seh_pushreg %rsi |
| ; SSE-WIN-NEXT: subq $160, %rsp |
| ; SSE-WIN-NEXT: .seh_stackalloc 160 |
| ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm15, 144 |
| ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm14, 128 |
| ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm13, 112 |
| ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm12, 96 |
| ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm11, 80 |
| ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm10, 64 |
| ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm9, 48 |
| ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm8, 32 |
| ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 16 |
| ; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0 |
| ; SSE-WIN-NEXT: .seh_endprologue |
| ; SSE-WIN-NEXT: xorl %r9d, %r9d |
| ; SSE-WIN-NEXT: leaq v(%rip), %r8 |
| ; SSE-WIN-NEXT: leaq x(%rip), %r10 |
| ; SSE-WIN-NEXT: leaq y(%rip), %r11 |
| ; SSE-WIN-NEXT: leaq z(%rip), %rax |
| ; SSE-WIN-NEXT: leaq w(%rip), %rdx |
| ; SSE-WIN-NEXT: .p2align 4, 0x90 |
| ; SSE-WIN-NEXT: .LBB8_1: # %for.cond1.preheader |
| ; SSE-WIN-NEXT: # =>This Loop Header: Depth=1 |
| ; SSE-WIN-NEXT: # Child Loop BB8_2 Depth 2 |
| ; SSE-WIN-NEXT: movq %r8, %rcx |
| ; SSE-WIN-NEXT: xorl %esi, %esi |
| ; SSE-WIN-NEXT: .p2align 4, 0x90 |
| ; SSE-WIN-NEXT: .LBB8_2: # %for.body3 |
| ; SSE-WIN-NEXT: # Parent Loop BB8_1 Depth=1 |
| ; SSE-WIN-NEXT: # => This Inner Loop Header: Depth=2 |
| ; SSE-WIN-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-WIN-NEXT: cvtsi2sdl (%rcx), %xmm0 |
| ; SSE-WIN-NEXT: mulsd (%rsi,%r10), %xmm0 |
| ; SSE-WIN-NEXT: mulsd (%rsi,%r11), %xmm0 |
| ; SSE-WIN-NEXT: mulsd (%rsi,%rax), %xmm0 |
| ; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%rdx) |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: addq $8, %rsi |
| ; SSE-WIN-NEXT: addq $4, %rcx |
| ; SSE-WIN-NEXT: cmpq $8192, %rsi # imm = 0x2000 |
| ; SSE-WIN-NEXT: jne .LBB8_2 |
| ; SSE-WIN-NEXT: # %bb.3: # %for.inc14 |
| ; SSE-WIN-NEXT: # in Loop: Header=BB8_1 Depth=1 |
| ; SSE-WIN-NEXT: incl %r9d |
| ; SSE-WIN-NEXT: cmpl $100000, %r9d # imm = 0x186A0 |
| ; SSE-WIN-NEXT: jne .LBB8_1 |
| ; SSE-WIN-NEXT: # %bb.4: # %for.end16 |
| ; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; SSE-WIN-NEXT: addq $160, %rsp |
| ; SSE-WIN-NEXT: popq %rsi |
| ; SSE-WIN-NEXT: retq |
| ; SSE-WIN-NEXT: .seh_endproc |
| ; |
| ; AVX-LABEL: loopdep3: |
| ; AVX: # %bb.0: # %entry |
| ; AVX-NEXT: pushq %rsi |
| ; AVX-NEXT: .seh_pushreg %rsi |
| ; AVX-NEXT: subq $160, %rsp |
| ; AVX-NEXT: .seh_stackalloc 160 |
| ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm15, 144 |
| ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm14, 128 |
| ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm13, 112 |
| ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm12, 96 |
| ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm11, 80 |
| ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm10, 64 |
| ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm9, 48 |
| ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm8, 32 |
| ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm7, 16 |
| ; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm6, 0 |
| ; AVX-NEXT: .seh_endprologue |
| ; AVX-NEXT: xorl %r9d, %r9d |
| ; AVX-NEXT: leaq v(%rip), %r8 |
| ; AVX-NEXT: leaq x(%rip), %r10 |
| ; AVX-NEXT: leaq y(%rip), %r11 |
| ; AVX-NEXT: leaq z(%rip), %rax |
| ; AVX-NEXT: leaq w(%rip), %rdx |
| ; AVX-NEXT: .p2align 4, 0x90 |
| ; AVX-NEXT: .LBB8_1: # %for.cond1.preheader |
| ; AVX-NEXT: # =>This Loop Header: Depth=1 |
| ; AVX-NEXT: # Child Loop BB8_2 Depth 2 |
| ; AVX-NEXT: movq %r8, %rcx |
| ; AVX-NEXT: xorl %esi, %esi |
| ; AVX-NEXT: .p2align 4, 0x90 |
| ; AVX-NEXT: .LBB8_2: # %for.body3 |
| ; AVX-NEXT: # Parent Loop BB8_1 Depth=1 |
| ; AVX-NEXT: # => This Inner Loop Header: Depth=2 |
| ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vcvtsi2sdl (%rcx), %xmm0, %xmm0 |
| ; AVX-NEXT: vmulsd (%rsi,%r10), %xmm0, %xmm0 |
| ; AVX-NEXT: vmulsd (%rsi,%r11), %xmm0, %xmm0 |
| ; AVX-NEXT: vmulsd (%rsi,%rax), %xmm0, %xmm0 |
| ; AVX-NEXT: vmovsd %xmm0, (%rsi,%rdx) |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: addq $8, %rsi |
| ; AVX-NEXT: addq $4, %rcx |
| ; AVX-NEXT: cmpq $8192, %rsi # imm = 0x2000 |
| ; AVX-NEXT: jne .LBB8_2 |
| ; AVX-NEXT: # %bb.3: # %for.inc14 |
| ; AVX-NEXT: # in Loop: Header=BB8_1 Depth=1 |
| ; AVX-NEXT: incl %r9d |
| ; AVX-NEXT: cmpl $100000, %r9d # imm = 0x186A0 |
| ; AVX-NEXT: jne .LBB8_1 |
| ; AVX-NEXT: # %bb.4: # %for.end16 |
| ; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; AVX-NEXT: addq $160, %rsp |
| ; AVX-NEXT: popq %rsi |
| ; AVX-NEXT: retq |
| ; AVX-NEXT: .seh_endproc |
| entry: |
| br label %for.cond1.preheader |
| |
| for.cond1.preheader: ; preds = %for.inc14, %entry |
| %i.025 = phi i32 [ 0, %entry ], [ %inc15, %for.inc14 ] |
| br label %for.body3 |
| |
| for.body3: |
| %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] |
| ; The convert's source is loaded from @v, i.e. it is a memory operand. |
| %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @v, i64 0, i64 %indvars.iv |
| %0 = load i32, i32* %arrayidx, align 4 |
| %conv = sitofp i32 %0 to double |
| %arrayidx5 = getelementptr inbounds [1024 x double], [1024 x double]* @x, i64 0, i64 %indvars.iv |
| %1 = load double, double* %arrayidx5, align 8 |
| %mul = fmul double %conv, %1 |
| %arrayidx7 = getelementptr inbounds [1024 x double], [1024 x double]* @y, i64 0, i64 %indvars.iv |
| %2 = load double, double* %arrayidx7, align 8 |
| %mul8 = fmul double %mul, %2 |
| %arrayidx10 = getelementptr inbounds [1024 x double], [1024 x double]* @z, i64 0, i64 %indvars.iv |
| %3 = load double, double* %arrayidx10, align 8 |
| %mul11 = fmul double %mul8, %3 |
| %arrayidx13 = getelementptr inbounds [1024 x double], [1024 x double]* @w, i64 0, i64 %indvars.iv |
| store double %mul11, double* %arrayidx13, align 8 |
| %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 |
| %exitcond = icmp eq i64 %indvars.iv.next, 1024 |
| ; Empty asm whose constraint string marks xmm0-xmm31 as clobbered at the end |
| ; of every inner iteration. |
| tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() |
| br i1 %exitcond, label %for.inc14, label %for.body3 |
| |
| for.inc14: ; preds = %for.body3 |
| %inc15 = add nsw i32 %i.025, 1 |
| %exitcond26 = icmp eq i32 %inc15, 100000 |
| br i1 %exitcond26, label %for.end16, label %for.cond1.preheader |
| |
| for.end16: ; preds = %for.inc14 |
| ret void |
| |
| } |
| |
| define dso_local double @inlineasmdep(i64 %arg) { |
| ; SSE-LINUX-LABEL: inlineasmdep: |
| ; SSE-LINUX: # %bb.0: # %top |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-LINUX-NEXT: cvtsi2sd %rdi, %xmm0 |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: inlineasmdep: |
| ; SSE-WIN: # %bb.0: # %top |
| ; SSE-WIN-NEXT: subq $168, %rsp |
| ; SSE-WIN-NEXT: .seh_stackalloc 168 |
| ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm15, 144 |
| ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm14, 128 |
| ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm13, 112 |
| ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm12, 96 |
| ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm11, 80 |
| ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm10, 64 |
| ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm9, 48 |
| ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm8, 32 |
| ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 16 |
| ; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0 |
| ; SSE-WIN-NEXT: .seh_endprologue |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-WIN-NEXT: cvtsi2sd %rcx, %xmm0 |
| ; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; SSE-WIN-NEXT: addq $168, %rsp |
| ; SSE-WIN-NEXT: retq |
| ; SSE-WIN-NEXT: .seh_endproc |
| ; |
| ; AVX-LABEL: inlineasmdep: |
| ; AVX: # %bb.0: # %top |
| ; AVX-NEXT: subq $168, %rsp |
| ; AVX-NEXT: .seh_stackalloc 168 |
| ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm15, 144 |
| ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm14, 128 |
| ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm13, 112 |
| ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm12, 96 |
| ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm11, 80 |
| ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm10, 64 |
| ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm9, 48 |
| ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm8, 32 |
| ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm7, 16 |
| ; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm6, 0 |
| ; AVX-NEXT: .seh_endprologue |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 |
| ; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; AVX-NEXT: addq $168, %rsp |
| ; AVX-NEXT: retq |
| ; AVX-NEXT: .seh_endproc |
| top: |
| tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() |
| %tmp1 = sitofp i64 %arg to double |
| ret double %tmp1 |
| } |
| |
| ; Make sure we are making a smart choice regarding undef registers and |
| ; hiding the false dependency behind a true dependency. |
| ; |
| ; The float argument is extended to double after a series of inline asm |
| ; statements that clobber every XMM register (xmm6 first). In the expected |
| ; code the argument is spilled before the asm statements, reloaded into xmm0 |
| ; afterwards, and the conversion uses xmm0 as both source and destination. |
| ; The expectations for the win32 triple additionally save and restore |
| ; xmm6-xmm15 around the body. |
define dso_local double @truedeps(float %arg) {
| ; SSE-LINUX-LABEL: truedeps: |
| ; SSE-LINUX: # %bb.0: # %top |
| ; SSE-LINUX-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload |
| ; SSE-LINUX-NEXT: # xmm0 = mem[0],zero,zero,zero |
| ; SSE-LINUX-NEXT: cvtss2sd %xmm0, %xmm0 |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: truedeps: |
| ; SSE-WIN: # %bb.0: # %top |
| ; SSE-WIN-NEXT: subq $184, %rsp |
| ; SSE-WIN-NEXT: .seh_stackalloc 184 |
| ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm15, 160 |
| ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm14, 144 |
| ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm13, 128 |
| ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm12, 112 |
| ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm11, 96 |
| ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm10, 80 |
| ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm9, 64 |
| ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm8, 48 |
| ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 32 |
| ; SSE-WIN-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm6, 16 |
| ; SSE-WIN-NEXT: .seh_endprologue |
| ; SSE-WIN-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload |
| ; SSE-WIN-NEXT: # xmm0 = mem[0],zero,zero,zero |
| ; SSE-WIN-NEXT: cvtss2sd %xmm0, %xmm0 |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; SSE-WIN-NEXT: addq $184, %rsp |
| ; SSE-WIN-NEXT: retq |
| ; SSE-WIN-NEXT: .seh_endproc |
| ; |
| ; AVX-LABEL: truedeps: |
| ; AVX: # %bb.0: # %top |
| ; AVX-NEXT: subq $184, %rsp |
| ; AVX-NEXT: .seh_stackalloc 184 |
| ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm15, 160 |
| ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm14, 144 |
| ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm13, 128 |
| ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm12, 112 |
| ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm11, 96 |
| ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm10, 80 |
| ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm9, 64 |
| ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm8, 48 |
| ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm7, 32 |
| ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm6, 16 |
| ; AVX-NEXT: .seh_endprologue |
| ; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload |
| ; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero |
| ; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; AVX-NEXT: addq $184, %rsp |
| ; AVX-NEXT: retq |
| ; AVX-NEXT: .seh_endproc |
| top: |
| ; xmm6 is clobbered on its own first; the remaining statements then clobber |
| ; xmm0-xmm5 and xmm7-xmm31, so xmm6 is the register clobbered longest ago when |
| ; the fpext below is reached. The xmm16-xmm31 clobbers presumably only matter |
| ; for the avx512vl run. |
| tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() |
| ; The conversion truly depends on %arg; the expected code reloads it into xmm0 |
| ; and converts in place instead of introducing another register. |
| %tmp1 = fpext float %arg to double |
| ret double %tmp1 |
| } |
| |
| ; Make sure we are making a smart choice regarding undef registers and |
| ; choosing the register with the highest clearance. |
| ; |
| ; The i64 argument is converted to double after a series of inline asm |
| ; statements that clobber every XMM register, with xmm6 clobbered first. The |
| ; expected VEX-encoded code zeroes xmm6 and uses it as the XMM source operand |
| ; of vcvtsi2sd, writing the result to xmm0; the expected non-VEX code zeroes |
| ; xmm0 and converts into it. |
| define dso_local double @clearence(i64 %arg) { |
| ; SSE-LINUX-LABEL: clearence: |
| ; SSE-LINUX: # %bb.0: # %top |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-LINUX-NEXT: cvtsi2sd %rdi, %xmm0 |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: clearence: |
| ; SSE-WIN: # %bb.0: # %top |
| ; SSE-WIN-NEXT: subq $168, %rsp |
| ; SSE-WIN-NEXT: .seh_stackalloc 168 |
| ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm15, 144 |
| ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm14, 128 |
| ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm13, 112 |
| ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm12, 96 |
| ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm11, 80 |
| ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm10, 64 |
| ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm9, 48 |
| ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm8, 32 |
| ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 16 |
| ; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0 |
| ; SSE-WIN-NEXT: .seh_endprologue |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: xorps %xmm0, %xmm0 |
| ; SSE-WIN-NEXT: cvtsi2sd %rcx, %xmm0 |
| ; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; SSE-WIN-NEXT: addq $168, %rsp |
| ; SSE-WIN-NEXT: retq |
| ; SSE-WIN-NEXT: .seh_endproc |
| ; |
| ; AVX-LABEL: clearence: |
| ; AVX: # %bb.0: # %top |
| ; AVX-NEXT: subq $168, %rsp |
| ; AVX-NEXT: .seh_stackalloc 168 |
| ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm15, 144 |
| ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm14, 128 |
| ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm13, 112 |
| ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm12, 96 |
| ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm11, 80 |
| ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm10, 64 |
| ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm9, 48 |
| ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm8, 32 |
| ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm7, 16 |
| ; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill |
| ; AVX-NEXT: .seh_savexmm %xmm6, 0 |
| ; AVX-NEXT: .seh_endprologue |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: vxorps %xmm6, %xmm6, %xmm6 |
| ; AVX-NEXT: vcvtsi2sd %rcx, %xmm6, %xmm0 |
| ; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; AVX-NEXT: addq $168, %rsp |
| ; AVX-NEXT: retq |
| ; AVX-NEXT: .seh_endproc |
| top: |
| ; xmm6 is clobbered on its own first; the remaining statements then clobber |
| ; xmm0-xmm5 and xmm7-xmm31, so xmm6 is the register clobbered longest ago when |
| ; the sitofp below is reached. The xmm16-xmm31 clobbers presumably only matter |
| ; for the avx512vl run. |
| tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() |
| ; Integer-to-double conversion: no XMM input comes from the IR, so the choice |
| ; of the extra XMM operand in the VEX form is left to the compiler. |
| %tmp1 = sitofp i64 %arg to double |
| ret double %tmp1 |
| } |
| |
| ; Make sure we are making a smart choice regarding undef registers in order to |
| ; avoid a cyclic dependence on a write to the same register in a previous |
| ; iteration, especially when we cannot zero out the undef register because it |
| ; is live. |
| ; |
| ; Each iteration converts the counter %i to double, executes inline asm |
| ; statements that clobber xmm0-xmm3 and xmm8-xmm31 (xmm4-xmm7 are not listed in |
| ; any clobber), adds the value loaded from %y, converts the sum back to i64 and |
| ; accumulates it. The accumulator starts at the value loaded from %x and is |
| ; returned once the incremented counter equals 156250000. |
| define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind { |
| ; SSE-LINUX-LABEL: loopclearence: |
| ; SSE-LINUX: # %bb.0: # %entry |
| ; SSE-LINUX-NEXT: movq (%rdi), %rax |
| ; SSE-LINUX-NEXT: movl $1, %ecx |
| ; SSE-LINUX-NEXT: .p2align 4, 0x90 |
| ; SSE-LINUX-NEXT: .LBB12_1: # %loop |
| ; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; SSE-LINUX-NEXT: xorps %xmm4, %xmm4 |
| ; SSE-LINUX-NEXT: cvtsi2sd %rcx, %xmm4 |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: addsd (%rsi), %xmm4 |
| ; SSE-LINUX-NEXT: cvttsd2si %xmm4, %rdx |
| ; SSE-LINUX-NEXT: addq %rdx, %rax |
| ; SSE-LINUX-NEXT: incq %rcx |
| ; SSE-LINUX-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90 |
| ; SSE-LINUX-NEXT: jne .LBB12_1 |
| ; SSE-LINUX-NEXT: # %bb.2: # %ret |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: loopclearence: |
| ; SSE-WIN: # %bb.0: # %entry |
| ; SSE-WIN-NEXT: subq $136, %rsp |
| ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill |
| ; SSE-WIN-NEXT: movq (%rcx), %rax |
| ; SSE-WIN-NEXT: movl $1, %r8d |
| ; SSE-WIN-NEXT: .p2align 4, 0x90 |
| ; SSE-WIN-NEXT: .LBB12_1: # %loop |
| ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; SSE-WIN-NEXT: xorps %xmm4, %xmm4 |
| ; SSE-WIN-NEXT: cvtsi2sd %r8, %xmm4 |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: addsd (%rdx), %xmm4 |
| ; SSE-WIN-NEXT: cvttsd2si %xmm4, %rcx |
| ; SSE-WIN-NEXT: addq %rcx, %rax |
| ; SSE-WIN-NEXT: incq %r8 |
| ; SSE-WIN-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 |
| ; SSE-WIN-NEXT: jne .LBB12_1 |
| ; SSE-WIN-NEXT: # %bb.2: # %ret |
| ; SSE-WIN-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; SSE-WIN-NEXT: addq $136, %rsp |
| ; SSE-WIN-NEXT: retq |
| ; |
| ; AVX-LABEL: loopclearence: |
| ; AVX: # %bb.0: # %entry |
| ; AVX-NEXT: subq $136, %rsp |
| ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill |
| ; AVX-NEXT: movq (%rcx), %rax |
| ; AVX-NEXT: movl $1, %r8d |
| ; AVX-NEXT: .p2align 4, 0x90 |
| ; AVX-NEXT: .LBB12_1: # %loop |
| ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; AVX-NEXT: vcvtsi2sd %r8, %xmm5, %xmm4 |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: #APP |
| ; AVX-NEXT: #NO_APP |
| ; AVX-NEXT: vaddsd (%rdx), %xmm4, %xmm0 |
| ; AVX-NEXT: vcvttsd2si %xmm0, %rcx |
| ; AVX-NEXT: addq %rcx, %rax |
| ; AVX-NEXT: incq %r8 |
| ; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 |
| ; AVX-NEXT: jne .LBB12_1 |
| ; AVX-NEXT: # %bb.2: # %ret |
| ; AVX-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; AVX-NEXT: addq $136, %rsp |
| ; AVX-NEXT: retq |
| entry: |
| ; Initial value of the running sum. |
| %vx = load i64, i64* %x |
| br label %loop |
| loop: |
| ; %i counts up from 1; %s1 carries the running sum between iterations. |
| %i = phi i64 [ 1, %entry ], [ %inc, %loop ] |
| %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ] |
| ; The conversion under test is placed before the clobbers, so its result must |
| ; stay live across them; the expected code keeps it in xmm4. |
| %fi = sitofp i64 %i to double |
| ; Clobber xmm0-xmm3 and xmm8-xmm31, leaving only xmm4-xmm7 untouched by the |
| ; inline asm. The xmm16-xmm31 clobbers presumably only matter for the avx512vl |
| ; run. |
| tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() |
| ; %y is reloaded on every iteration and added to the converted counter. |
| %vy = load double, double* %y |
| %fipy = fadd double %fi, %vy |
| %iipy = fptosi double %fipy to i64 |
| %s2 = add i64 %s1, %iipy |
| %inc = add nsw i64 %i, 1 |
| ; Leave the loop once the incremented counter reaches 156250000. |
| %exitcond = icmp eq i64 %inc, 156250000 |
| br i1 %exitcond, label %ret, label %loop |
| ret: |
| ret i64 %s2 |
| } |
| |
| ; Make sure we are making a smart choice regarding undef registers even for more |
| ; complicated loop structures. This example is the inner loop from |
| ; julia> a = falses(10000); a[1:4:end] = true |
| ; julia> linspace(1.0,2.0,10000)[a] |
| define dso_local void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) { |
| ; SSE-LINUX-LABEL: loopclearance2: |
| ; SSE-LINUX: # %bb.0: # %entry |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: #APP |
| ; SSE-LINUX-NEXT: #NO_APP |
| ; SSE-LINUX-NEXT: movl $1, %r8d |
| ; SSE-LINUX-NEXT: xorl %ecx, %ecx |
| ; SSE-LINUX-NEXT: .p2align 4, 0x90 |
| ; SSE-LINUX-NEXT: .LBB13_1: # %inner_loop |
| ; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; SSE-LINUX-NEXT: movq %rcx, %rax |
| ; SSE-LINUX-NEXT: shrq $6, %rcx |
| ; SSE-LINUX-NEXT: movq (%rsi,%rcx,8), %rcx |
| ; SSE-LINUX-NEXT: btq %rax, %rcx |
| ; SSE-LINUX-NEXT: leaq 1(%rax), %rcx |
| ; SSE-LINUX-NEXT: jae .LBB13_1 |
| ; SSE-LINUX-NEXT: # %bb.2: # %loop_end |
| ; SSE-LINUX-NEXT: # in Loop: Header=BB13_1 Depth=1 |
| ; SSE-LINUX-NEXT: leaq 1(%r8), %r9 |
| ; SSE-LINUX-NEXT: xorps %xmm4, %xmm4 |
| ; SSE-LINUX-NEXT: cvtsi2sd %r9, %xmm4 |
| ; SSE-LINUX-NEXT: movapd %xmm0, %xmm5 |
| ; SSE-LINUX-NEXT: subsd %xmm4, %xmm5 |
| ; SSE-LINUX-NEXT: mulsd %xmm1, %xmm5 |
| ; SSE-LINUX-NEXT: leaq -1(%rcx), %rax |
| ; SSE-LINUX-NEXT: xorps %xmm4, %xmm4 |
| ; SSE-LINUX-NEXT: cvtsi2sd %rax, %xmm4 |
| ; SSE-LINUX-NEXT: mulsd %xmm2, %xmm4 |
| ; SSE-LINUX-NEXT: addsd %xmm5, %xmm4 |
| ; SSE-LINUX-NEXT: divsd %xmm3, %xmm4 |
| ; SSE-LINUX-NEXT: movsd %xmm4, -8(%rdi,%r8,8) |
| ; SSE-LINUX-NEXT: movq %r9, %r8 |
| ; SSE-LINUX-NEXT: cmpq %r9, %rdx |
| ; SSE-LINUX-NEXT: jge .LBB13_1 |
| ; SSE-LINUX-NEXT: # %bb.3: # %loopdone |
| ; SSE-LINUX-NEXT: retq |
| ; |
| ; SSE-WIN-LABEL: loopclearance2: |
| ; SSE-WIN: # %bb.0: # %entry |
| ; SSE-WIN-NEXT: subq $152, %rsp |
| ; SSE-WIN-NEXT: .seh_stackalloc 152 |
| ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm15, 128 |
| ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm14, 112 |
| ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm13, 96 |
| ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm12, 80 |
| ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm11, 64 |
| ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm10, 48 |
| ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm9, 32 |
| ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm8, 16 |
| ; SSE-WIN-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill |
| ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 0 |
| ; SSE-WIN-NEXT: .seh_endprologue |
| ; SSE-WIN-NEXT: movq {{[0-9]+}}(%rsp), %r8 |
| ; SSE-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero |
| ; SSE-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: #APP |
| ; SSE-WIN-NEXT: #NO_APP |
| ; SSE-WIN-NEXT: movl $1, %r9d |
| ; SSE-WIN-NEXT: xorl %r11d, %r11d |
| ; SSE-WIN-NEXT: .p2align 4, 0x90 |
| ; SSE-WIN-NEXT: .LBB13_1: # %inner_loop |
| ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; SSE-WIN-NEXT: movq %r11, %r10 |
| ; SSE-WIN-NEXT: movq %r11, %rax |
| ; SSE-WIN-NEXT: shrq $6, %rax |
| ; SSE-WIN-NEXT: movq (%rdx,%rax,8), %rax |
| ; SSE-WIN-NEXT: btq %r11, %rax |
| ; SSE-WIN-NEXT: leaq 1(%r11), %r11 |
| ; SSE-WIN-NEXT: jae .LBB13_1 |
| ; SSE-WIN-NEXT: # %bb.2: # %loop_end |
| ; SSE-WIN-NEXT: # in Loop: Header=BB13_1 Depth=1 |
| ; SSE-WIN-NEXT: leaq 1(%r9), %r10 |
| ; SSE-WIN-NEXT: xorps %xmm4, %xmm4 |
| ; SSE-WIN-NEXT: cvtsi2sd %r10, %xmm4 |
| ; SSE-WIN-NEXT: movapd %xmm2, %xmm5 |
| ; SSE-WIN-NEXT: subsd %xmm4, %xmm5 |
| ; SSE-WIN-NEXT: mulsd %xmm3, %xmm5 |
| ; SSE-WIN-NEXT: leaq -1(%r11), %rax |
| ; SSE-WIN-NEXT: xorps %xmm4, %xmm4 |
| ; SSE-WIN-NEXT: cvtsi2sd %rax, %xmm4 |
| ; SSE-WIN-NEXT: mulsd %xmm1, %xmm4 |
| ; SSE-WIN-NEXT: addsd %xmm5, %xmm4 |
| ; SSE-WIN-NEXT: divsd %xmm0, %xmm4 |
| ; SSE-WIN-NEXT: movsd %xmm4, -8(%rcx,%r9,8) |
| ; SSE-WIN-NEXT: movq %r10, %r9 |
| ; SSE-WIN-NEXT: cmpq %r10, %r8 |
| ; SSE-WIN-NEXT: jge .LBB13_1 |
| ; SSE-WIN-NEXT: # %bb.3: # %loopdone |
| ; SSE-WIN-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; SSE-WIN-NEXT: addq $152, %rsp |
| ; SSE-WIN-NEXT: retq |
| ; SSE-WIN-NEXT: .seh_endproc |
| ; |
| ; AVX1-LABEL: loopclearance2: |
| ; AVX1: # %bb.0: # %entry |
| ; AVX1-NEXT: subq $152, %rsp |
| ; AVX1-NEXT: .seh_stackalloc 152 |
| ; AVX1-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX1-NEXT: .seh_savexmm %xmm15, 128 |
| ; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX1-NEXT: .seh_savexmm %xmm14, 112 |
| ; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX1-NEXT: .seh_savexmm %xmm13, 96 |
| ; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX1-NEXT: .seh_savexmm %xmm12, 80 |
| ; AVX1-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX1-NEXT: .seh_savexmm %xmm11, 64 |
| ; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX1-NEXT: .seh_savexmm %xmm10, 48 |
| ; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX1-NEXT: .seh_savexmm %xmm9, 32 |
| ; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX1-NEXT: .seh_savexmm %xmm8, 16 |
| ; AVX1-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill |
| ; AVX1-NEXT: .seh_savexmm %xmm7, 0 |
| ; AVX1-NEXT: .seh_endprologue |
| ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r8 |
| ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero |
| ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero |
| ; AVX1-NEXT: #APP |
| ; AVX1-NEXT: #NO_APP |
| ; AVX1-NEXT: #APP |
| ; AVX1-NEXT: #NO_APP |
| ; AVX1-NEXT: #APP |
| ; AVX1-NEXT: #NO_APP |
| ; AVX1-NEXT: #APP |
| ; AVX1-NEXT: #NO_APP |
| ; AVX1-NEXT: #APP |
| ; AVX1-NEXT: #NO_APP |
| ; AVX1-NEXT: #APP |
| ; AVX1-NEXT: #NO_APP |
| ; AVX1-NEXT: #APP |
| ; AVX1-NEXT: #NO_APP |
| ; AVX1-NEXT: movl $1, %r9d |
| ; AVX1-NEXT: xorl %r11d, %r11d |
| ; AVX1-NEXT: .p2align 4, 0x90 |
| ; AVX1-NEXT: .LBB13_1: # %inner_loop |
| ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; AVX1-NEXT: movq %r11, %r10 |
| ; AVX1-NEXT: movq %r11, %rax |
| ; AVX1-NEXT: shrq $6, %rax |
| ; AVX1-NEXT: movq (%rdx,%rax,8), %rax |
| ; AVX1-NEXT: btq %r11, %rax |
| ; AVX1-NEXT: leaq 1(%r11), %r11 |
| ; AVX1-NEXT: jae .LBB13_1 |
| ; AVX1-NEXT: # %bb.2: # %loop_end |
| ; AVX1-NEXT: # in Loop: Header=BB13_1 Depth=1 |
| ; AVX1-NEXT: leaq 1(%r9), %r10 |
| ; AVX1-NEXT: vcvtsi2sd %r10, %xmm6, %xmm4 |
| ; AVX1-NEXT: vsubsd %xmm4, %xmm2, %xmm4 |
| ; AVX1-NEXT: vmulsd %xmm3, %xmm4, %xmm4 |
| ; AVX1-NEXT: leaq -1(%r11), %rax |
| ; AVX1-NEXT: vcvtsi2sd %rax, %xmm6, %xmm5 |
| ; AVX1-NEXT: vmulsd %xmm1, %xmm5, %xmm5 |
| ; AVX1-NEXT: vaddsd %xmm5, %xmm4, %xmm4 |
| ; AVX1-NEXT: vdivsd %xmm0, %xmm4, %xmm4 |
| ; AVX1-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8) |
| ; AVX1-NEXT: movq %r10, %r9 |
| ; AVX1-NEXT: cmpq %r10, %r8 |
| ; AVX1-NEXT: jge .LBB13_1 |
| ; AVX1-NEXT: # %bb.3: # %loopdone |
| ; AVX1-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload |
| ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; AVX1-NEXT: addq $152, %rsp |
| ; AVX1-NEXT: retq |
| ; AVX1-NEXT: .seh_endproc |
| ; |
| ; AVX512VL-LABEL: loopclearance2: |
| ; AVX512VL: # %bb.0: # %entry |
| ; AVX512VL-NEXT: subq $152, %rsp |
| ; AVX512VL-NEXT: .seh_stackalloc 152 |
| ; AVX512VL-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX512VL-NEXT: .seh_savexmm %xmm15, 128 |
| ; AVX512VL-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX512VL-NEXT: .seh_savexmm %xmm14, 112 |
| ; AVX512VL-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX512VL-NEXT: .seh_savexmm %xmm13, 96 |
| ; AVX512VL-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX512VL-NEXT: .seh_savexmm %xmm12, 80 |
| ; AVX512VL-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX512VL-NEXT: .seh_savexmm %xmm11, 64 |
| ; AVX512VL-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX512VL-NEXT: .seh_savexmm %xmm10, 48 |
| ; AVX512VL-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX512VL-NEXT: .seh_savexmm %xmm9, 32 |
| ; AVX512VL-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; AVX512VL-NEXT: .seh_savexmm %xmm8, 16 |
| ; AVX512VL-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill |
| ; AVX512VL-NEXT: .seh_savexmm %xmm7, 0 |
| ; AVX512VL-NEXT: .seh_endprologue |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero |
| ; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r8 |
| ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero |
| ; AVX512VL-NEXT: #APP |
| ; AVX512VL-NEXT: #NO_APP |
| ; AVX512VL-NEXT: #APP |
| ; AVX512VL-NEXT: #NO_APP |
| ; AVX512VL-NEXT: #APP |
| ; AVX512VL-NEXT: #NO_APP |
| ; AVX512VL-NEXT: #APP |
| ; AVX512VL-NEXT: #NO_APP |
| ; AVX512VL-NEXT: #APP |
| ; AVX512VL-NEXT: #NO_APP |
| ; AVX512VL-NEXT: #APP |
| ; AVX512VL-NEXT: #NO_APP |
| ; AVX512VL-NEXT: #APP |
| ; AVX512VL-NEXT: #NO_APP |
| ; AVX512VL-NEXT: movl $1, %r9d |
| ; AVX512VL-NEXT: xorl %r11d, %r11d |
| ; AVX512VL-NEXT: .p2align 4, 0x90 |
| ; AVX512VL-NEXT: .LBB13_1: # %inner_loop |
| ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; AVX512VL-NEXT: movq %r11, %r10 |
| ; AVX512VL-NEXT: movq %r11, %rax |
| ; AVX512VL-NEXT: shrq $6, %rax |
| ; AVX512VL-NEXT: movq (%rdx,%rax,8), %rax |
| ; AVX512VL-NEXT: btq %r11, %rax |
| ; AVX512VL-NEXT: leaq 1(%r11), %r11 |
| ; AVX512VL-NEXT: jae .LBB13_1 |
| ; AVX512VL-NEXT: # %bb.2: # %loop_end |
| ; AVX512VL-NEXT: # in Loop: Header=BB13_1 Depth=1 |
| ; AVX512VL-NEXT: leaq 1(%r9), %r10 |
| ; AVX512VL-NEXT: vcvtsi2sd %r10, %xmm6, %xmm4 |
| ; AVX512VL-NEXT: vsubsd %xmm4, %xmm2, %xmm4 |
| ; AVX512VL-NEXT: vmulsd %xmm3, %xmm4, %xmm4 |
| ; AVX512VL-NEXT: leaq -1(%r11), %rax |
| ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm6, %xmm5 |
| ; AVX512VL-NEXT: vmulsd %xmm1, %xmm5, %xmm5 |
| ; AVX512VL-NEXT: vaddsd %xmm5, %xmm4, %xmm4 |
| ; AVX512VL-NEXT: vdivsd %xmm0, %xmm4, %xmm4 |
| ; AVX512VL-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8) |
| ; AVX512VL-NEXT: movq %r10, %r9 |
| ; AVX512VL-NEXT: cmpq %r10, %r8 |
| ; AVX512VL-NEXT: jge .LBB13_1 |
| ; AVX512VL-NEXT: # %bb.3: # %loopdone |
| ; AVX512VL-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload |
| ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload |
| ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload |
| ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload |
| ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload |
| ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload |
| ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload |
| ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload |
| ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload |
| ; AVX512VL-NEXT: addq $152, %rsp |
| ; AVX512VL-NEXT: retq |
| ; AVX512VL-NEXT: .seh_endproc |
| entry: |
| tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"() |
| tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() |
| br label %loop |
| |
| loop: |
| %phi_i = phi i64 [ 1, %entry ], [ %nexti, %loop_end ] |
| %phi_j = phi i64 [ 1, %entry ], [ %nextj, %loop_end ] |
| %phi_k = phi i64 [ 0, %entry ], [ %nextk, %loop_end ] |
| br label %inner_loop |
| |
| inner_loop: |
| %phi = phi i64 [ %phi_k, %loop ], [ %nextk, %inner_loop ] |
| %idx = lshr i64 %phi, 6 |
| %inputptr = getelementptr i64, i64* %x, i64 %idx |
| %input = load i64, i64* %inputptr, align 8 |
| %masked = and i64 %phi, 63 |
| %shiftedmasked = shl i64 1, %masked |
| %maskedinput = and i64 %input, %shiftedmasked |
| %cmp = icmp eq i64 %maskedinput, 0 |
| %nextk = add i64 %phi, 1 |
| br i1 %cmp, label %inner_loop, label %loop_end |
| |
| loop_end: |
| %nexti = add i64 %phi_i, 1 |
| %nextj = add i64 %phi_j, 1 |
| ; Register use, plus the inline asm above clobbering xmm7-xmm15 (and |
| ; xmm16-xmm31, where they exist), basically forces xmm6 here as the only |
| ; reasonable choice. The primary thing we care about is that it's not one of |
| ; the registers used in the loop (e.g. not the output reg here). |
| %nexti_f = sitofp i64 %nexti to double |
| %sub = fsub double %c1, %nexti_f |
| %mul = fmul double %sub, %c2 |
| %phi_f = sitofp i64 %phi to double |
| %mul2 = fmul double %phi_f, %c3 |
| %add2 = fadd double %mul, %mul2 |
| %div = fdiv double %add2, %c4 |
| %prev_j = add i64 %phi_j, -1 |
| %outptr = getelementptr double, double* %y, i64 %prev_j |
| store double %div, double* %outptr, align 8 |
| %done = icmp slt i64 %size, %nexti |
| br i1 %done, label %loopdone, label %loop |
| |
| loopdone: |
| ret void |
| } |