| # This test performs basic block (BB) reordering on a function that |
| # contains the unsupported instruction jrcxz. Reordering works correctly |
| # with the following options: None, Normal or Reverse. Other strategies |
| # fail with the assertion `isIntN(Size * 8 + 1, Value)`. |
| # The cause is the distance between basic blocks when one of them |
| # contains a jrcxz instruction. |
| # Example: OpenSSL |
| # https://github.com/openssl/openssl/blob/master/crypto/bn/asm/x86_64-mont5.pl#L3319 |
| |
| # REQUIRES: system-linux |
| |
| # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \ |
| # RUN: %s -o %t.o |
| # RUN: link_fdata %s %t.o %t.fdata |
| # RUN: %clang %cflags %t.o -falign-labels -march=native -o %t.exe -Wl,-q |
| |
| # RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \ |
| # RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort \ |
| # RUN: --split-functions --split-all-cold --split-eh --dyno-stats \ |
| # RUN: --print-finalized 2>&1 | FileCheck %s |
| |
| # CHECK-NOT: value of -2105 is too large for field of 1 byte. |
| |
| .text |
| .section .text.startup,"ax",@progbits |
| .p2align 5,,31 |
| # main: entry symbol for the linked test executable. It does nothing but |
| # jump straight into bn_sqrx8x_internal (a jmp, not a call, so no return |
| # address is pushed and no arguments are set up here). |
| .globl main |
| .type main, @function |
| main: |
| jmp bn_sqrx8x_internal |
| |
| .globl bn_sqrx8x_internal |
| .hidden bn_sqrx8x_internal |
| .type bn_sqrx8x_internal,@function |
| .align 32 |
| # bn_sqrx8x_internal |
| # |
| # Large multi-block function used as the reordering subject of this test; |
| # it appears to be derived from the OpenSSL routine linked at the top of |
| # this file. The block that matters for the test is the one ending in |
| # jrcxz inside .Lsqrx4x_shift_n_add. |
| # |
| # Values read here without being produced in this file (presumably set up |
| # by the caller - TODO confirm): %rsi, %r9, %xmm0, %xmm2, %xmm3 and the |
| # stack slot 32+8(%rsp). |
| # |
| # Stack slots written by this function: |
| #   0+8(%rsp)   %r9 on entry; later reused for a pointer bound (%rbp+%r9-64) |
| #   8+8(%rsp)   %rsi+%r9 on entry; later reused for a saved %rdi |
| #   16+8(%rsp)  0/-1 carry mask produced by sbb |
| #   24+8(%rsp)  saved %rdi, later a saved %rax |
| #   48+8(%rsp)  start of the scratch area addressed through %rdi |
| # |
| # adcx uses only CF and adox uses only OF, so the code keeps two carry |
| # chains alive at once. The .byte sequences starting with 0xc4 appear to |
| # be VEX-encoded mulx instructions with 32-bit displacements, and the lone |
| # 0x3e/0x66/0x67 bytes are instruction prefixes emitted in front of the |
| # following instruction (presumably padding) - TODO confirm by disassembly. |
| bn_sqrx8x_internal: |
| __bn_sqrx8x_internal: |
| # Branch profile records consumed by the link_fdata step at the top of |
| # this file. Each record appears to be laid out as |
| #   <is-symbol> <from> <from-offset> <is-symbol> <to> <to-offset> |
| #   <mispredictions> <count> |
| # with hexadecimal offsets - TODO confirm against the BOLT profile format. |
| # The offsets depend on the exact encoding of every instruction below. |
| # FDATA: 1 bn_from_mont8x 160 1 bn_sqrx8x_internal 0 0 56 |
| # FDATA: 1 bn_sqrx8x_internal 13 1 bn_sqrx8x_internal 40 0 60972 |
| # FDATA: 1 bn_sqrx8x_internal 5f 1 bn_sqrx8x_internal 2c 0 60972 |
| # FDATA: 1 bn_sqrx8x_internal 2f1 1 bn_sqrx8x_internal 500 0 60972 |
| # FDATA: 1 bn_sqrx8x_internal 34a 1 bn_sqrx8x_internal 360 0 60972 |
| # FDATA: 1 bn_sqrx8x_internal 411 1 bn_sqrx8x_internal 360 0 447888 |
| # FDATA: 1 bn_sqrx8x_internal 411 1 bn_sqrx8x_internal 417 0 63984 |
| # FDATA: 1 bn_sqrx8x_internal 427 1 bn_sqrx8x_internal 480 0 60972 |
| # FDATA: 1 bn_sqrx8x_internal 427 1 bn_sqrx8x_internal 429 0 3012 |
| # FDATA: 1 bn_sqrx8x_internal 467 1 bn_sqrx8x_internal 360 0 3012 |
| # FDATA: 1 bn_sqrx8x_internal 4ba 1 bn_sqrx8x_internal 80 0 58964 |
| # FDATA: 1 bn_sqrx8x_internal 4ba 1 bn_sqrx8x_internal 4c0 0 2008 |
| # FDATA: 1 bn_sqrx8x_internal 4fb 1 bn_sqrx8x_internal 80 0 2008 |
| # FDATA: 1 bn_sqrx8x_internal 5f0 1 bn_sqrx8x_internal 5f2 0 180908 |
| # FDATA: 1 bn_sqrx8x_internal 61b 1 bn_sqrx8x_internal 540 0 180908 |
| # FDATA: 1 bn_sqrx8x_internal 632 1 bn_sqrx8x_internal 637 0 59020 |
| # FDATA: 1 bn_sqrx8x_internal 657 1 bn_sqrx8x_internal 660 0 59020 |
| # FDATA: 1 bn_sqrx8x_internal 696 1 bn_sqrx8x_internal 6a0 0 120048 |
| # FDATA: 1 bn_sqrx8x_internal 75a 1 bn_sqrx8x_internal 6a0 0 840336 |
| # FDATA: 1 bn_sqrx8x_internal 75a 1 bn_sqrx8x_internal 760 0 120048 |
| # FDATA: 1 bn_sqrx8x_internal 768 1 bn_sqrx8x_internal 76e 0 120048 |
| # FDATA: 1 bn_sqrx8x_internal 7b2 1 bn_sqrx8x_internal 7c0 0 120048 |
| # FDATA: 1 bn_sqrx8x_internal 86e 1 bn_sqrx8x_internal 7c0 0 896560 |
| # FDATA: 1 bn_sqrx8x_internal 86e 1 bn_sqrx8x_internal 874 0 128080 |
| # FDATA: 1 bn_sqrx8x_internal 879 1 bn_sqrx8x_internal 8c0 0 120048 |
| # FDATA: 1 bn_sqrx8x_internal 879 1 bn_sqrx8x_internal 87b 0 8032 |
| # FDATA: 1 bn_sqrx8x_internal 8bb 1 bn_sqrx8x_internal 7c0 0 8032 |
| # FDATA: 1 bn_sqrx8x_internal 8e8 1 bn_sqrx8x_internal 8ed 0 120048 |
| # FDATA: 1 bn_sqrx8x_internal 955 1 bn_sqrx8x_internal 660 0 61028 |
| # FDATA: 1 bn_sqrx8x_internal 955 1 bn_sqrx8x_internal 95b 0 59020 |
| # FDATA: 0 [unknown] 0 1 bn_sqrx8x_internal 5f0 0 59020 |
| .cfi_startproc |
| # Setup: %rdi = address of the scratch area at 48+8(%rsp), |
| # %rbp = %rsi + %r9; %r9 and %rbp are saved in 0+8(%rsp) and 8+8(%rsp). |
| leaq 48+8(%rsp),%rdi |
| leaq (%rsi,%r9,1),%rbp |
| movq %r9,0+8(%rsp) |
| movq %rbp,8+8(%rsp) |
| jmp .Lsqr8x_zero_start |
| |
| .align 32 |
| # Raw encoding of a 12-byte multi-prefix NOP placed after the alignment. |
| .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 |
| # Fill loop: a full pass stores %xmm0 to 128 bytes at (%rdi), advances |
| # %rdi by 128 and subtracts 64 from %r9, repeating until %r9 is zero. |
| # The loop is entered at .Lsqr8x_zero_start, so the first pass stores only |
| # 64 bytes. %xmm0 is presumably all-zero on entry - TODO confirm. |
| .Lsqrx8x_zero: |
| .byte 0x3e |
| movdqa %xmm0,0(%rdi) |
| movdqa %xmm0,16(%rdi) |
| movdqa %xmm0,32(%rdi) |
| movdqa %xmm0,48(%rdi) |
| .Lsqr8x_zero_start: |
| movdqa %xmm0,64(%rdi) |
| movdqa %xmm0,80(%rdi) |
| movdqa %xmm0,96(%rdi) |
| movdqa %xmm0,112(%rdi) |
| leaq 128(%rdi),%rdi |
| subq $64,%r9 |
| jnz .Lsqrx8x_zero |
| |
| # Load the first source word into %rdx (the implicit mulx operand), clear |
| # %r10-%r15 and %rbp, and point %rdi back at the start of the scratch area. |
| # The final xor also leaves CF and OF cleared for the carry chains below. |
| movq 0(%rsi),%rdx |
| |
| xorq %r10,%r10 |
| xorq %r11,%r11 |
| xorq %r12,%r12 |
| xorq %r13,%r13 |
| xorq %r14,%r14 |
| xorq %r15,%r15 |
| leaq 48+8(%rsp),%rdi |
| xorq %rbp,%rbp |
| jmp .Lsqrx8x_outer_loop |
| |
| .align 32 |
| # Outer loop: multiplies the word in %rdx by the following words of the |
| # 64-byte group at (%rsi), then reloads %rdx with the next word of the |
| # group (8, 16, 24, 32(%rsi), then register copies of 40 and 48(%rsi)) and |
| # repeats with the remaining higher words. Finished words are stored to |
| # 8..64(%rdi); %rbp stays zero and is used to fold in the carries. |
| .Lsqrx8x_outer_loop: |
| mulxq 8(%rsi),%r8,%rax |
| adcxq %r9,%r8 |
| adoxq %rax,%r10 |
| mulxq 16(%rsi),%r9,%rax |
| adcxq %r10,%r9 |
| adoxq %rax,%r11 |
| .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 |
| adcxq %r11,%r10 |
| adoxq %rax,%r12 |
| .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 |
| adcxq %r12,%r11 |
| adoxq %rax,%r13 |
| mulxq 40(%rsi),%r12,%rax |
| adcxq %r13,%r12 |
| adoxq %rax,%r14 |
| mulxq 48(%rsi),%r13,%rax |
| adcxq %r14,%r13 |
| adoxq %r15,%rax |
| mulxq 56(%rsi),%r14,%r15 |
| movq 8(%rsi),%rdx |
| adcxq %rax,%r14 |
| adoxq %rbp,%r15 |
| adcq 64(%rdi),%r15 |
| movq %r8,8(%rdi) |
| movq %r9,16(%rdi) |
| # sbb turns the carry-out into a 0/-1 mask in %rcx; the xor then restarts |
| # both carry chains with CF and OF cleared. |
| sbbq %rcx,%rcx |
| xorq %rbp,%rbp |
| |
| mulxq 16(%rsi),%r8,%rbx |
| mulxq 24(%rsi),%r9,%rax |
| adcxq %r10,%r8 |
| adoxq %rbx,%r9 |
| mulxq 32(%rsi),%r10,%rbx |
| adcxq %r11,%r9 |
| adoxq %rax,%r10 |
| .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 |
| adcxq %r12,%r10 |
| adoxq %rbx,%r11 |
| .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 |
| adcxq %r13,%r11 |
| adoxq %r14,%r12 |
| .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 |
| movq 16(%rsi),%rdx |
| adcxq %rax,%r12 |
| adoxq %rbx,%r13 |
| adcxq %r15,%r13 |
| adoxq %rbp,%r14 |
| adcxq %rbp,%r14 |
| |
| movq %r8,24(%rdi) |
| movq %r9,32(%rdi) |
| |
| mulxq 24(%rsi),%r8,%rbx |
| mulxq 32(%rsi),%r9,%rax |
| adcxq %r10,%r8 |
| adoxq %rbx,%r9 |
| mulxq 40(%rsi),%r10,%rbx |
| adcxq %r11,%r9 |
| adoxq %rax,%r10 |
| .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 |
| adcxq %r12,%r10 |
| adoxq %r13,%r11 |
| .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 |
| .byte 0x3e |
| movq 24(%rsi),%rdx |
| adcxq %rbx,%r11 |
| adoxq %rax,%r12 |
| adcxq %r14,%r12 |
| movq %r8,40(%rdi) |
| movq %r9,48(%rdi) |
| mulxq 32(%rsi),%r8,%rax |
| adoxq %rbp,%r13 |
| adcxq %rbp,%r13 |
| |
| mulxq 40(%rsi),%r9,%rbx |
| adcxq %r10,%r8 |
| adoxq %rax,%r9 |
| mulxq 48(%rsi),%r10,%rax |
| adcxq %r11,%r9 |
| adoxq %r12,%r10 |
| mulxq 56(%rsi),%r11,%r12 |
| movq 32(%rsi),%rdx |
| movq 40(%rsi),%r14 |
| adcxq %rbx,%r10 |
| adoxq %rax,%r11 |
| movq 48(%rsi),%r15 |
| adcxq %r13,%r11 |
| adoxq %rbp,%r12 |
| adcxq %rbp,%r12 |
| |
| movq %r8,56(%rdi) |
| movq %r9,64(%rdi) |
| |
| # The last three source words are now held in %r14, %r15 and %r8, so the |
| # remaining multiplications use register operands. |
| mulxq %r14,%r9,%rax |
| movq 56(%rsi),%r8 |
| adcxq %r10,%r9 |
| mulxq %r15,%r10,%rbx |
| adoxq %rax,%r10 |
| adcxq %r11,%r10 |
| mulxq %r8,%r11,%rax |
| movq %r14,%rdx |
| adoxq %rbx,%r11 |
| adcxq %r12,%r11 |
| |
| adcxq %rbp,%rax |
| |
| mulxq %r15,%r14,%rbx |
| mulxq %r8,%r12,%r13 |
| movq %r15,%rdx |
| leaq 64(%rsi),%rsi |
| adcxq %r14,%r11 |
| adoxq %rbx,%r12 |
| adcxq %rax,%r12 |
| adoxq %rbp,%r13 |
| |
| .byte 0x67,0x67 |
| mulxq %r8,%r8,%r14 |
| adcxq %r8,%r13 |
| adcxq %rbp,%r14 |
| |
| # %rsi was advanced by 64 above; leave the outer loop once it reaches the |
| # end pointer saved in 8+8(%rsp). |
| cmpq 8+8(%rsp),%rsi |
| je .Lsqrx8x_outer_break |
| |
| # neg regenerates CF from the 0/-1 mask left in %rcx by the sbb above; the |
| # mov that follows does not touch flags and makes %rcx the -8..-1 index of |
| # the inner loop. The adcx/adc chain then adds the words at 64..120(%rdi). |
| negq %rcx |
| movq $-8,%rcx |
| movq %rbp,%r15 |
| movq 64(%rdi),%r8 |
| adcxq 72(%rdi),%r9 |
| adcxq 80(%rdi),%r10 |
| adcxq 88(%rdi),%r11 |
| adcq 96(%rdi),%r12 |
| adcq 104(%rdi),%r13 |
| adcq 112(%rdi),%r14 |
| adcq 120(%rdi),%r15 |
| leaq (%rsi),%rbp |
| leaq 128(%rdi),%rdi |
| # Capture the carry-out as a 0/-1 mask; it is parked in 16+8(%rsp) below. |
| sbbq %rax,%rax |
| |
| movq -64(%rsi),%rdx |
| movq %rax,16+8(%rsp) |
| movq %rdi,24+8(%rsp) |
| |
| |
| # xor clears CF and OF before the inner loop's carry chains start. |
| xorl %eax,%eax |
| jmp .Lsqrx8x_loop |
| |
| .align 32 |
| # Inner loop, 8 passes (%rcx counts from -8 up to 0): multiplies %rdx by |
| # the eight words at 0..56(%rbp), accumulates into %r8-%r15, stores the |
| # lowest word to (%rdi,%rcx,8) and loads the next multiplier from |
| # 8(%rsi,%rcx,8). inc leaves CF untouched, so the adcx chain survives the |
| # loop counter update. |
| .Lsqrx8x_loop: |
| movq %r8,%rbx |
| mulxq 0(%rbp),%rax,%r8 |
| adcxq %rax,%rbx |
| adoxq %r9,%r8 |
| |
| mulxq 8(%rbp),%rax,%r9 |
| adcxq %rax,%r8 |
| adoxq %r10,%r9 |
| |
| mulxq 16(%rbp),%rax,%r10 |
| adcxq %rax,%r9 |
| adoxq %r11,%r10 |
| |
| mulxq 24(%rbp),%rax,%r11 |
| adcxq %rax,%r10 |
| adoxq %r12,%r11 |
| |
| .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 |
| adcxq %rax,%r11 |
| adoxq %r13,%r12 |
| |
| mulxq 40(%rbp),%rax,%r13 |
| adcxq %rax,%r12 |
| adoxq %r14,%r13 |
| |
| mulxq 48(%rbp),%rax,%r14 |
| movq %rbx,(%rdi,%rcx,8) |
| movl $0,%ebx |
| adcxq %rax,%r13 |
| adoxq %r15,%r14 |
| |
| .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 |
| movq 8(%rsi,%rcx,8),%rdx |
| adcxq %rax,%r14 |
| adoxq %rbx,%r15 |
| adcxq %rbx,%r15 |
| |
| .byte 0x67 |
| incq %rcx |
| jnz .Lsqrx8x_loop |
| |
| # Advance %rbp to the next 64-byte group and stop when it reaches the end |
| # pointer saved in 8+8(%rsp). |
| leaq 64(%rbp),%rbp |
| movq $-8,%rcx |
| cmpq 8+8(%rsp),%rbp |
| je .Lsqrx8x_break |
| |
| # %rbx is zero here, so subtracting the saved 0/-1 mask reproduces the |
| # saved carry in CF before the next eight words at (%rdi) are added. |
| subq 16+8(%rsp),%rbx |
| .byte 0x66 |
| movq -64(%rsi),%rdx |
| adcxq 0(%rdi),%r8 |
| adcxq 8(%rdi),%r9 |
| adcq 16(%rdi),%r10 |
| adcq 24(%rdi),%r11 |
| adcq 32(%rdi),%r12 |
| adcq 40(%rdi),%r13 |
| adcq 48(%rdi),%r14 |
| adcq 56(%rdi),%r15 |
| leaq 64(%rdi),%rdi |
| .byte 0x67 |
| sbbq %rax,%rax |
| xorl %ebx,%ebx |
| movq %rax,16+8(%rsp) |
| jmp .Lsqrx8x_loop |
| |
| .align 32 |
| # End of one inner sweep: restore the saved carry into CF, propagate it |
| # through %r8-%r15, reload %rdx from 0(%rsi) and fetch the %rdi value |
| # saved in 24+8(%rsp) into %rcx. If %rdi already equals it, go straight |
| # back to the outer loop; otherwise store %r9-%r15 at 8..56(%rdi), reload |
| # them from 8..56(%rcx) and continue the outer loop with %rdi = %rcx. |
| .Lsqrx8x_break: |
| xorq %rbp,%rbp |
| subq 16+8(%rsp),%rbx |
| adcxq %rbp,%r8 |
| movq 24+8(%rsp),%rcx |
| adcxq %rbp,%r9 |
| movq 0(%rsi),%rdx |
| adcq $0,%r10 |
| movq %r8,0(%rdi) |
| adcq $0,%r11 |
| adcq $0,%r12 |
| adcq $0,%r13 |
| adcq $0,%r14 |
| adcq $0,%r15 |
| cmpq %rcx,%rdi |
| je .Lsqrx8x_outer_loop |
| |
| movq %r9,8(%rdi) |
| movq 8(%rcx),%r9 |
| movq %r10,16(%rdi) |
| movq 16(%rcx),%r10 |
| movq %r11,24(%rdi) |
| movq 24(%rcx),%r11 |
| movq %r12,32(%rdi) |
| movq 32(%rcx),%r12 |
| movq %r13,40(%rdi) |
| movq 40(%rcx),%r13 |
| movq %r14,48(%rdi) |
| movq 48(%rcx),%r14 |
| movq %r15,56(%rdi) |
| movq 56(%rcx),%r15 |
| movq %rcx,%rdi |
| jmp .Lsqrx8x_outer_loop |
| |
| .align 32 |
| # Outer loop exit: store %r9-%r14 at 72..112(%rdi), reset %rdi to the |
| # start of the scratch area and reload %r9 from 0+8(%rsp). The raw bytes |
| # 102,72,15,126,217 encode movq %xmm3,%rcx; %rcx then indexes the source |
| # through (%rsi,%rcx,1). |
| .Lsqrx8x_outer_break: |
| movq %r9,72(%rdi) |
| .byte 102,72,15,126,217 |
| movq %r10,80(%rdi) |
| movq %r11,88(%rdi) |
| movq %r12,96(%rdi) |
| movq %r13,104(%rdi) |
| movq %r14,112(%rdi) |
| leaq 48+8(%rsp),%rdi |
| movq (%rsi,%rcx,1),%rdx |
| |
| movq 8(%rdi),%r11 |
| xorq %r10,%r10 |
| movq 0+8(%rsp),%r9 |
| adoxq %r11,%r11 |
| movq 16(%rdi),%r12 |
| movq 24(%rdi),%r13 |
| |
| .align 32 |
| # Shift-and-add loop: each step squares one source word (mulx with %rdx as |
| # both factors) and adds it, through CF, to stored words that have been |
| # doubled through OF (adox reg,reg). One pass consumes 32 bytes of source |
| # (%rcx += 32) and rewrites 64 bytes at (%rdi). |
| .Lsqrx4x_shift_n_add: |
| mulxq %rdx,%rax,%rbx |
| adoxq %r12,%r12 |
| adcxq %r10,%rax |
| # Raw encodings of movq 8(%rsi,%rcx,1),%rdx and movq 32(%rdi),%r10 using |
| # 32-bit displacements. |
| .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 |
| .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 |
| adoxq %r13,%r13 |
| adcxq %r11,%rbx |
| movq 40(%rdi),%r11 |
| movq %rax,0(%rdi) |
| movq %rbx,8(%rdi) |
| |
| mulxq %rdx,%rax,%rbx |
| adoxq %r10,%r10 |
| adcxq %r12,%rax |
| movq 16(%rsi,%rcx,1),%rdx |
| movq 48(%rdi),%r12 |
| adoxq %r11,%r11 |
| adcxq %r13,%rbx |
| movq 56(%rdi),%r13 |
| movq %rax,16(%rdi) |
| movq %rbx,24(%rdi) |
| |
| mulxq %rdx,%rax,%rbx |
| adoxq %r12,%r12 |
| adcxq %r10,%rax |
| movq 24(%rsi,%rcx,1),%rdx |
| leaq 32(%rcx),%rcx |
| movq 64(%rdi),%r10 |
| adoxq %r13,%r13 |
| adcxq %r11,%rbx |
| movq 72(%rdi),%r11 |
| movq %rax,32(%rdi) |
| movq %rbx,40(%rdi) |
| |
| mulxq %rdx,%rax,%rbx |
| adoxq %r10,%r10 |
| adcxq %r12,%rax |
| # Loop exit when %rcx has reached zero. jrcxz tests %rcx without reading |
| # or writing flags, so both carry chains stay intact. It only has an 8-bit |
| # relative encoding, so its target must stay within short-branch range of |
| # this block; this is the branch the test exercises. |
| jrcxz .Lsqrx4x_shift_n_add_break |
| # Raw encoding of movq 0(%rsi,%rcx,1),%rdx with a 32-bit displacement. |
| .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 |
| adoxq %r11,%r11 |
| adcxq %r13,%rbx |
| movq 80(%rdi),%r12 |
| movq 88(%rdi),%r13 |
| movq %rax,48(%rdi) |
| movq %rbx,56(%rdi) |
| leaq 64(%rdi),%rdi |
| nop |
| jmp .Lsqrx4x_shift_n_add |
| |
| .align 32 |
| # Target of the jrcxz above: finish the last step and store its two words. |
| # The raw bytes 102,72,15,126,213 encode movq %xmm2,%rbp. |
| .Lsqrx4x_shift_n_add_break: |
| adcxq %r13,%rbx |
| movq %rax,48(%rdi) |
| movq %rbx,56(%rdi) |
| leaq 64(%rdi),%rdi |
| .byte 102,72,15,126,213 |
| # Second phase, entered by fall-through. %rbx = 32+8(%rsp), %rdx = first |
| # scratch word, 0+8(%rsp) = %rbp + %r9 - 64 (pointer bound used by the |
| # jae tests below), 8+8(%rsp) = current %rdi (bound for the final jb). |
| __bn_sqrx8x_reduction: |
| xorl %eax,%eax |
| movq 32+8(%rsp),%rbx |
| movq 48+8(%rsp),%rdx |
| leaq -64(%rbp,%r9,1),%rcx |
| |
| movq %rcx,0+8(%rsp) |
| movq %rdi,8+8(%rsp) |
| |
| leaq 48+8(%rsp),%rdi |
| jmp .Lsqrx8x_reduction_loop |
| |
| .align 32 |
| # Per-group setup: %r8 = current word (%rdx), %r9-%r15 = next seven words |
| # at 8..56(%rdi), %rdx = %rdx * %rbx (low 64 bits), %rax saved in |
| # 24+8(%rsp). xor of %rsi clears it together with CF and OF; %rcx = -8. |
| .Lsqrx8x_reduction_loop: |
| movq 8(%rdi),%r9 |
| movq 16(%rdi),%r10 |
| movq 24(%rdi),%r11 |
| movq 32(%rdi),%r12 |
| movq %rdx,%r8 |
| imulq %rbx,%rdx |
| movq 40(%rdi),%r13 |
| movq 48(%rdi),%r14 |
| movq 56(%rdi),%r15 |
| movq %rax,24+8(%rsp) |
| |
| leaq 64(%rdi),%rdi |
| xorq %rsi,%rsi |
| movq $-8,%rcx |
| jmp .Lsqrx8x_reduce |
| |
| .align 32 |
| # 8 passes: multiply %rdx by the eight words at 0..56(%rbp) and accumulate |
| # into %r8-%r15. In the middle of each pass the next multiplier is formed |
| # as the low half of %r8 * 32+8(%rsp) (kept in %rbx until the end of the |
| # pass) and the current multiplier is stored at 64+48+8(%rsp,%rcx,8). |
| .Lsqrx8x_reduce: |
| movq %r8,%rbx |
| mulxq 0(%rbp),%rax,%r8 |
| adcxq %rbx,%rax |
| adoxq %r9,%r8 |
| |
| mulxq 8(%rbp),%rbx,%r9 |
| adcxq %rbx,%r8 |
| adoxq %r10,%r9 |
| |
| mulxq 16(%rbp),%rbx,%r10 |
| adcxq %rbx,%r9 |
| adoxq %r11,%r10 |
| |
| mulxq 24(%rbp),%rbx,%r11 |
| adcxq %rbx,%r10 |
| adoxq %r12,%r11 |
| |
| .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 |
| movq %rdx,%rax |
| movq %r8,%rdx |
| adcxq %rbx,%r11 |
| adoxq %r13,%r12 |
| |
| mulxq 32+8(%rsp),%rbx,%rdx |
| movq %rax,%rdx |
| movq %rax,64+48+8(%rsp,%rcx,8) |
| |
| mulxq 40(%rbp),%rax,%r13 |
| adcxq %rax,%r12 |
| adoxq %r14,%r13 |
| |
| mulxq 48(%rbp),%rax,%r14 |
| adcxq %rax,%r13 |
| adoxq %r15,%r14 |
| |
| mulxq 56(%rbp),%rax,%r15 |
| movq %rbx,%rdx |
| adcxq %rax,%r14 |
| adoxq %rsi,%r15 |
| adcxq %rsi,%r15 |
| |
| .byte 0x67,0x67,0x67 |
| incq %rcx |
| jnz .Lsqrx8x_reduce |
| |
| # %rsi is zero, so %rax = 0. Skip the tail when %rbp is already at or past |
| # the bound saved in 0+8(%rsp). |
| movq %rsi,%rax |
| cmpq 0+8(%rsp),%rbp |
| jae .Lsqrx8x_no_tail |
| |
| # Tail setup: reload the first stored multiplier from 48+8(%rsp), add the |
| # next eight words at (%rdi) into %r8-%r15, advance %rbp and %rdi by 64 and |
| # park the carry-out as a 0/-1 mask in 16+8(%rsp). |
| movq 48+8(%rsp),%rdx |
| addq 0(%rdi),%r8 |
| leaq 64(%rbp),%rbp |
| movq $-8,%rcx |
| adcxq 8(%rdi),%r9 |
| adcxq 16(%rdi),%r10 |
| adcq 24(%rdi),%r11 |
| adcq 32(%rdi),%r12 |
| adcq 40(%rdi),%r13 |
| adcq 48(%rdi),%r14 |
| adcq 56(%rdi),%r15 |
| leaq 64(%rdi),%rdi |
| sbbq %rax,%rax |
| |
| xorq %rsi,%rsi |
| movq %rax,16+8(%rsp) |
| jmp .Lsqrx8x_tail |
| |
| .align 32 |
| # Tail loop, 8 passes: same multiply-accumulate shape as the loop above, |
| # but each pass takes its next multiplier from the values stored at |
| # 72+48+8(%rsp,%rcx,8) and writes the finished low word to (%rdi,%rcx,8). |
| .Lsqrx8x_tail: |
| movq %r8,%rbx |
| mulxq 0(%rbp),%rax,%r8 |
| adcxq %rax,%rbx |
| adoxq %r9,%r8 |
| |
| mulxq 8(%rbp),%rax,%r9 |
| adcxq %rax,%r8 |
| adoxq %r10,%r9 |
| |
| mulxq 16(%rbp),%rax,%r10 |
| adcxq %rax,%r9 |
| adoxq %r11,%r10 |
| |
| mulxq 24(%rbp),%rax,%r11 |
| adcxq %rax,%r10 |
| adoxq %r12,%r11 |
| |
| .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 |
| adcxq %rax,%r11 |
| adoxq %r13,%r12 |
| |
| mulxq 40(%rbp),%rax,%r13 |
| adcxq %rax,%r12 |
| adoxq %r14,%r13 |
| |
| mulxq 48(%rbp),%rax,%r14 |
| adcxq %rax,%r13 |
| adoxq %r15,%r14 |
| |
| mulxq 56(%rbp),%rax,%r15 |
| movq 72+48+8(%rsp,%rcx,8),%rdx |
| adcxq %rax,%r14 |
| adoxq %rsi,%r15 |
| movq %rbx,(%rdi,%rcx,8) |
| movq %r8,%rbx |
| adcxq %rsi,%r15 |
| |
| incq %rcx |
| jnz .Lsqrx8x_tail |
| |
| cmpq 0+8(%rsp),%rbp |
| jae .Lsqrx8x_tail_done |
| |
| # More groups remain: %rsi is zero, so the sub restores the saved carry |
| # into CF; add the next eight words at (%rdi), save the new carry mask and |
| # set %rcx back to -8 (it is zero after the loop). The xor that follows |
| # clears the flags left by that sub. |
| subq 16+8(%rsp),%rsi |
| movq 48+8(%rsp),%rdx |
| leaq 64(%rbp),%rbp |
| adcq 0(%rdi),%r8 |
| adcq 8(%rdi),%r9 |
| adcq 16(%rdi),%r10 |
| adcq 24(%rdi),%r11 |
| adcq 32(%rdi),%r12 |
| adcq 40(%rdi),%r13 |
| adcq 48(%rdi),%r14 |
| adcq 56(%rdi),%r15 |
| leaq 64(%rdi),%rdi |
| sbbq %rax,%rax |
| subq $8,%rcx |
| |
| xorq %rsi,%rsi |
| movq %rax,16+8(%rsp) |
| jmp .Lsqrx8x_tail |
| |
| .align 32 |
| # Tail finished: add the value saved in 24+8(%rsp) to %r8 and ripple the |
| # carry through %r9-%r15 into %rax, then restore the saved carry mask into |
| # CF for the adc chain that follows. |
| .Lsqrx8x_tail_done: |
| xorq %rax,%rax |
| addq 24+8(%rsp),%r8 |
| adcq $0,%r9 |
| adcq $0,%r10 |
| adcq $0,%r11 |
| adcq $0,%r12 |
| adcq $0,%r13 |
| adcq $0,%r14 |
| adcq $0,%r15 |
| adcq $0,%rax |
| |
| subq 16+8(%rsp),%rsi |
| # Add the eight words at 0..56(%rdi) with carry, collecting the final |
| # carry in %rax. The raw .byte lines encode movq %xmm3,%rcx and |
| # movq %xmm2,%rbp; 56(%rbp) is loaded into %rsi before %rbp is overwritten. |
| .Lsqrx8x_no_tail: |
| adcq 0(%rdi),%r8 |
| .byte 102,72,15,126,217 |
| adcq 8(%rdi),%r9 |
| movq 56(%rbp),%rsi |
| .byte 102,72,15,126,213 |
| adcq 16(%rdi),%r10 |
| adcq 24(%rdi),%r11 |
| adcq 32(%rdi),%r12 |
| adcq 40(%rdi),%r13 |
| adcq 48(%rdi),%r14 |
| adcq 56(%rdi),%r15 |
| adcq $0,%rax |
| |
| movq 32+8(%rsp),%rbx |
| movq 64(%rdi,%rcx,1),%rdx |
| |
| # Write the eight result words back to 0..56(%rdi), move %rdi to |
| # 64(%rdi,%rcx,1) and repeat from .Lsqrx8x_reduction_loop while the old |
| # %rdi + 64 (kept in %r8) is below the bound saved in 8+8(%rsp). |
| movq %r8,0(%rdi) |
| leaq 64(%rdi),%r8 |
| movq %r9,8(%rdi) |
| movq %r10,16(%rdi) |
| movq %r11,24(%rdi) |
| movq %r12,32(%rdi) |
| movq %r13,40(%rdi) |
| movq %r14,48(%rdi) |
| movq %r15,56(%rdi) |
| |
| leaq 64(%rdi,%rcx,1),%rdi |
| cmpq 8+8(%rsp),%r8 |
| jb .Lsqrx8x_reduction_loop |
| # Raw encoding of a ret with a 0xf3 (rep) prefix. |
| .byte 0xf3,0xc3 |
| .cfi_endproc |
| .size bn_sqrx8x_internal,.-bn_sqrx8x_internal |