| ## This test checks that BOLT correctly inlines memcpy calls on AArch64. |
| |
| # REQUIRES: system-linux, aarch64-registered-target |
| |
| # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o |
| # RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q |
| # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE |
| # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM |
| |
| # Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 18 total call sites: 17 to memcpy and 1 to _memcpy8) |
| # CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls |
| |
| # Each function whose call is inlined should use optimal size-specific instructions and NO memcpy calls |
| |
| # 1-byte copy should use single byte load/store (ldrb/strb) |
| # CHECK-ASM-LABEL: <test_1_byte_direct>: |
| # CHECK-ASM: ldrb{{.*}}w9, [x1] |
| # CHECK-ASM-NEXT: strb{{.*}}w9, [x0] |
| # CHECK-ASM-NOT: bl{{.*}}<memcpy |
| |
| # 2-byte copy should use single 16-bit load/store (ldrh/strh) |
| # CHECK-ASM-LABEL: <test_2_byte_direct>: |
| # CHECK-ASM: ldrh{{.*}}w9, [x1] |
| # CHECK-ASM-NEXT: strh{{.*}}w9, [x0] |
| # CHECK-ASM-NOT: bl{{.*}}<memcpy |
| |
| # 4-byte copy should use single 32-bit load/store (w register) |
| # CHECK-ASM-LABEL: <test_4_byte_direct>: |
| # CHECK-ASM: ldr{{.*}}w9, [x1] |
| # CHECK-ASM-NEXT: str{{.*}}w9, [x0] |
| # CHECK-ASM-NOT: bl{{.*}}<memcpy |
| |
| # 8-byte copy should use single 64-bit load/store (x register) |
| # CHECK-ASM-LABEL: <test_8_byte_direct>: |
| # CHECK-ASM: ldr{{.*}}x9, [x1] |
| # CHECK-ASM-NEXT: str{{.*}}x9, [x0] |
| # CHECK-ASM-NOT: bl{{.*}}<memcpy |
| |
| # 16-byte copy should use single 128-bit SIMD load/store (q register) |
| # CHECK-ASM-LABEL: <test_16_byte_direct>: |
| # CHECK-ASM: ldr{{.*}}q16, [x1] |
| # CHECK-ASM-NEXT: str{{.*}}q16, [x0] |
| # CHECK-ASM-NOT: bl{{.*}}<memcpy |
| |
| # 32-byte copy should use two 128-bit SIMD operations |
| # CHECK-ASM-LABEL: <test_32_byte_direct>: |
| # CHECK-ASM: ldr{{.*}}q16, [x1] |
| # CHECK-ASM-NEXT: str{{.*}}q16, [x0] |
| # CHECK-ASM-NEXT: ldr{{.*}}q17, [x1, #0x10] |
| # CHECK-ASM-NEXT: str{{.*}}q17, [x0, #0x10] |
| # CHECK-ASM-NOT: bl{{.*}}<memcpy |
| |
| # 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1) |
| # CHECK-ASM-LABEL: <test_37_byte_arbitrary>: |
| # CHECK-ASM: ldr{{.*}}q16, [x1] |
| # CHECK-ASM-NEXT: str{{.*}}q16, [x0] |
| # CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10] |
| # CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10] |
| # CHECK-ASM-NEXT: ldr{{.*}}w9, [x1, #0x20] |
| # CHECK-ASM-NEXT: str{{.*}}w9, [x0, #0x20] |
| # CHECK-ASM-NEXT: ldrb{{.*}}w9, [x1, #0x24] |
| # CHECK-ASM-NEXT: strb{{.*}}w9, [x0, #0x24] |
| # CHECK-ASM-NOT: bl{{.*}}<memcpy |
| |
| # 0-byte copy should be inlined with no load/store instructions (nothing to copy) |
| # CHECK-ASM-LABEL: <test_0_byte>: |
| # CHECK-ASM-NOT: ldr |
| # CHECK-ASM-NOT: str |
| # CHECK-ASM-NOT: bl{{.*}}<memcpy |
| |
| # Negative size should NOT be inlined (invalid size parameter) |
| # CHECK-ASM-LABEL: <test_negative_size>: |
| # CHECK-ASM: bl{{.*}}<memcpy |
| |
| # 128-byte copy should NOT be inlined (too large, original call preserved) |
| # CHECK-ASM-LABEL: <test_128_byte_too_large>: |
| # CHECK-ASM: bl{{.*}}<memcpy |
| |
| # ADD immediate whose source is a general-purpose register (x3, not xzr) should NOT be inlined (can't track mov+add chain) |
| # CHECK-ASM-LABEL: <test_4_byte_add_immediate>: |
| # CHECK-ASM: bl{{.*}}<memcpy |
| |
| # Register move should NOT be inlined (size unknown at compile time) |
| # CHECK-ASM-LABEL: <test_register_move_unknown>: |
| # CHECK-ASM: bl{{.*}}<memcpy |
| |
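| # x2 overwritten by a load after an immediate move should NOT be inlined (size unknown at compile time) |
| # CHECK-ASM-LABEL: <test_x2_rewrite_unknown>: |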
| # CHECK-ASM: bl{{.*}}<memcpy |
| |
| # Live-in parameter should NOT be inlined (size unknown at compile time) |
| # CHECK-ASM-LABEL: <test_live_in_unknown>: |
| # CHECK-ASM: bl{{.*}}<memcpy |
| |
| # _memcpy8 should be inlined with end-pointer return (dest+size) |
| # CHECK-ASM-LABEL: <test_memcpy8_4_byte>: |
| # CHECK-ASM: ldr{{.*}}w9, [x1] |
| # CHECK-ASM-NEXT: str{{.*}}w9, [x0] |
| # CHECK-ASM-NEXT: add{{.*}}x0, x0, #0x4 |
| # CHECK-ASM-NOT: bl{{.*}}<_memcpy8 |
| |
| # Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register |
| # CHECK-ASM-LABEL: <complex_operation>: |
| # CHECK-ASM: ldr{{.*}}x9, [x1] |
| # CHECK-ASM-NEXT: str{{.*}}x9, [x0] |
| # CHECK-ASM-NOT: bl{{.*}}<memcpy |
| |
| # Complex function with caller-saved Q16/Q17 should inline 64-byte memcpy using Q16 as temp register |
| # CHECK-ASM-LABEL: <complex_fp_operation>: |
| # CHECK-ASM: ldr{{.*}}q16, [x1] |
| # CHECK-ASM-NEXT: str{{.*}}q16, [x0] |
| # CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10] |
| # CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10] |
| # CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x20] |
| # CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x20] |
| # CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x30] |
| # CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x30] |
| # CHECK-ASM-NOT: bl{{.*}}<memcpy |
| |
| .text |
| # test_1_byte_direct: calls memcpy with the constant size 1 in x2. |
| # Source is sp+16 and destination is sp+8, both inside the 32-byte frame. |
| .globl test_1_byte_direct |
| .type test_1_byte_direct,@function |
| test_1_byte_direct: |
| # Allocate a 32-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #8 |
| mov x2, #1 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_1_byte_direct, .-test_1_byte_direct |
| |
| # test_2_byte_direct: calls memcpy with the constant size 2 in x2. |
| # Source is sp+16 and destination is sp+8, both inside the 32-byte frame. |
| .globl test_2_byte_direct |
| .type test_2_byte_direct,@function |
| test_2_byte_direct: |
| # Allocate a 32-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #8 |
| mov x2, #2 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_2_byte_direct, .-test_2_byte_direct |
| |
| # test_4_byte_direct: calls memcpy with the constant size 4 in x2. |
| # Source is sp+16 and destination is sp+8, both inside the 32-byte frame. |
| .globl test_4_byte_direct |
| .type test_4_byte_direct,@function |
| test_4_byte_direct: |
| # Allocate a 32-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #8 |
| mov x2, #4 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_4_byte_direct, .-test_4_byte_direct |
| |
| # test_8_byte_direct: calls memcpy with the constant size 8 in x2. |
| # Destination occupies sp+8..sp+16 and source sp+16..sp+24 of the 32-byte frame. |
| .globl test_8_byte_direct |
| .type test_8_byte_direct,@function |
| test_8_byte_direct: |
| # Allocate a 32-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #8 |
| mov x2, #8 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_8_byte_direct, .-test_8_byte_direct |
| |
| # test_16_byte_direct: calls memcpy with the constant size 16 in x2. |
| # Source occupies sp+16..sp+32 and destination sp+32..sp+48 of the 48-byte frame. |
| .globl test_16_byte_direct |
| .type test_16_byte_direct,@function |
| test_16_byte_direct: |
| # Allocate a 48-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-48]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #32 |
| mov x2, #16 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #48 |
| ret |
| .size test_16_byte_direct, .-test_16_byte_direct |
| |
| # test_32_byte_direct: calls memcpy with the constant size 32 in x2. |
| # Source occupies sp+16..sp+48 and destination sp+48..sp+80 of the 80-byte frame. |
| .globl test_32_byte_direct |
| .type test_32_byte_direct,@function |
| test_32_byte_direct: |
| # Allocate an 80-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-80]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #48 |
| mov x2, #32 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #80 |
| ret |
| .size test_32_byte_direct, .-test_32_byte_direct |
| |
| # test_37_byte_arbitrary: calls memcpy with the constant size 37 in x2, a size |
| # that is not a single power-of-two width. |
| # Source starts at sp+16 and destination at sp+56 inside the 96-byte frame. |
| .globl test_37_byte_arbitrary |
| .type test_37_byte_arbitrary,@function |
| test_37_byte_arbitrary: |
| # Allocate a 96-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-96]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #56 |
| mov x2, #37 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #96 |
| ret |
| .size test_37_byte_arbitrary, .-test_37_byte_arbitrary |
| |
| # test_0_byte: calls memcpy with the constant size 0 in x2 (nothing to copy). |
| # Source is sp+16 and destination is sp+8, both inside the 32-byte frame. |
| .globl test_0_byte |
| .type test_0_byte,@function |
| test_0_byte: |
| # Allocate a 32-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #8 |
| mov x2, #0 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_0_byte, .-test_0_byte |
| |
| # test_negative_size: calls memcpy with x2 = -1, i.e. every bit of the size |
| # register set. |
| .globl test_negative_size |
| .type test_negative_size,@function |
| test_negative_size: |
| # Negative size should not be inlined |
| # Allocate a 32-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #8 |
| mov x2, #-1 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_negative_size, .-test_negative_size |
| |
| # test_128_byte_too_large: calls memcpy with the constant size 128 in x2. |
| # Source occupies sp+16..sp+144 and destination sp+152..sp+280 of the 288-byte frame. |
| .globl test_128_byte_too_large |
| .type test_128_byte_too_large,@function |
| test_128_byte_too_large: |
| # Allocate a 288-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-288]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #152 |
| mov x2, #128 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #288 |
| ret |
| .size test_128_byte_too_large, .-test_128_byte_too_large |
| |
| # test_4_byte_add_immediate: the size in x2 is computed as x3 + 4, where x3 was |
| # set to 0 by the preceding move, rather than by a direct immediate move into x2. |
| .globl test_4_byte_add_immediate |
| .type test_4_byte_add_immediate,@function |
| test_4_byte_add_immediate: |
| # Allocate a 32-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source. |
| add x1, sp, #16 |
| add x0, sp, #8 |
| # Two-instruction chain producing x2 = 0 + 4. |
| mov x3, #0 |
| add x2, x3, #4 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_4_byte_add_immediate, .-test_4_byte_add_immediate |
| |
| # test_register_move_unknown: the size in x2 is copied from x6 (which holds 4) |
| # with a register-to-register move instead of an immediate move into x2. |
| .globl test_register_move_unknown |
| .type test_register_move_unknown,@function |
| test_register_move_unknown: |
| # Allocate a 32-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source. |
| add x1, sp, #16 |
| add x0, sp, #8 |
| # The instruction that writes x2 carries no immediate of its own. |
| mov x6, #4 |
| mov x2, x6 |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_register_move_unknown, .-test_register_move_unknown |
| |
| # test_x2_rewrite_unknown: x2 is first set to the constant 8 and then |
| # overwritten by a load, so the size that reaches memcpy is not a known constant. |
| .globl test_x2_rewrite_unknown |
| .type test_x2_rewrite_unknown,@function |
| test_x2_rewrite_unknown: |
| # Save x29/x30: bl overwrites x30, so the final ret needs the saved value. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source (same layout as the other tests). |
| add x1, sp, #16 |
| add x0, sp, #8 |
| mov x2, #8 |
| # The load is the last write to x2 before the call. |
| ldr x2, [sp, #24] |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_x2_rewrite_unknown, .-test_x2_rewrite_unknown |
| |
| # test_live_in_unknown: calls memcpy without writing x2 anywhere in the |
| # function, so the size is whatever the caller left in x2. |
| .globl test_live_in_unknown |
| .type test_live_in_unknown,@function |
| test_live_in_unknown: |
| # x2 comes in as parameter, no instruction sets it (should NOT inline) |
| # Allocate a 32-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # memcpy arguments: x0 = destination, x1 = source. |
| add x1, sp, #16 |
| add x0, sp, #8 |
| # x2 is live-in, no size-setting instruction |
| bl memcpy |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_live_in_unknown, .-test_live_in_unknown |
| |
| # test_memcpy8_4_byte: calls _memcpy8 (not memcpy) with the constant size 4 in x2. |
| # Source is sp+16 and destination is sp+8, both inside the 32-byte frame. |
| .globl test_memcpy8_4_byte |
| .type test_memcpy8_4_byte,@function |
| test_memcpy8_4_byte: |
| # Allocate a 32-byte frame and save the frame pointer and link register. |
| stp x29, x30, [sp, #-32]! |
| mov x29, sp |
| # Arguments: x0 = destination, x1 = source, x2 = byte count. |
| add x1, sp, #16 |
| add x0, sp, #8 |
| mov x2, #4 |
| bl _memcpy8 |
| # Restore x29/x30 and release the frame. |
| ldp x29, x30, [sp], #32 |
| ret |
| .size test_memcpy8_4_byte, .-test_memcpy8_4_byte |
| |
| # Simple _memcpy8 implementation that calls memcpy and returns dest+size. |
| # dest (x0) and size (x2) are needed after the call, so they are kept in the |
| # callee-saved registers x19/x20; AAPCS64 allows memcpy to clobber x0-x17. |
| .globl _memcpy8 |
| .type _memcpy8,@function |
| _memcpy8: |
| # 32-byte frame: x29/x30 at sp, x19/x20 at sp+16. |
| stp x29, x30, [sp, #-32]! |
| stp x19, x20, [sp, #16] |
| mov x29, sp |
| mov x19, x0 |
| mov x20, x2 |
| # x0, x1 and x2 are passed through unchanged from this function's caller. |
| bl memcpy |
| # Return the end pointer: dest + size. |
| add x0, x19, x20 |
| ldp x19, x20, [sp, #16] |
| ldp x29, x30, [sp], #32 |
| ret |
| .size _memcpy8, .-_memcpy8 |
| |
| # complex_operation: integer arithmetic on four 64-bit values loaded from [x0], |
| # with the four results stored to [x1], followed by an 8-byte memcpy from |
| # (incoming x0)+32 to (incoming x1)+32 and a tail branch to use. |
| # x9-x13 hold scratch values in the code leading up to the call. |
| .globl complex_operation |
| .type complex_operation,@function |
| complex_operation: |
| stp x29, x30, [sp, #-32]! |
| # x19 keeps the incoming x1 across the call below, so its old value is saved first. |
| str x19, [sp, #16] |
| mov x29, sp |
| ldp x9, x10, [x0] |
| ldp x11, x12, [x0, #16] |
| mov x19, x1 |
| # Keep the incoming x0 in x8; x0 becomes the memcpy destination (x1 + 32). |
| mov x8, x0 |
| add x0, x1, #32 |
| # The incoming x2, x3 and x4 are consumed here, before w2 is reused for the size. |
| madd x9, x9, x2, x3 |
| and x10, x10, x4 |
| asr x12, x12, #2 |
| # Size argument: writing w2 leaves x2 = 8 (upper 32 bits are zeroed). |
| mov w2, #8 |
| orr x11, x12, x11, lsl #3 |
| eor x12, x9, x10 |
| mul x10, x11, x10 |
| eor x12, x12, x11 |
| add x13, x12, x9 |
| add x9, x11, x9, asr #4 |
| stp x13, x10, [x1] |
| mov w10, w12 |
| stp x9, x10, [x1, #16] |
| # memcpy source: incoming x0 + 32. |
| add x1, x8, #32 |
| bl memcpy |
| # x0 = 64-bit value at offset 16 from the incoming x1 (preserved in x19). |
| ldr x0, [x19, #16] |
| ldr x19, [sp, #16] |
| ldp x29, x30, [sp], #32 |
| # Tail branch: x30 is already restored, so use returns to this function's caller. |
| b use |
| .size complex_operation, .-complex_operation |
| |
| # use: empty function that returns immediately; it is the target of the tail |
| # branch at the end of complex_operation. |
| .globl use |
| .type use,@function |
| use: |
| ret |
| .size use, .-use |
| |
| # Same as above but using FP caller-saved registers (Q16/17) |
| # complex_fp_operation: vector arithmetic on two 128-bit values loaded from |
| # [x0], with the two results stored to [x1], followed by a 64-byte memcpy from |
| # (incoming x0)+32 to (incoming x1)+32 and a tail branch to use_fp. |
| .globl complex_fp_operation |
| .type complex_fp_operation,@function |
| complex_fp_operation: |
| # 48-byte frame: x29/x30 at sp, q8/q9 (32 bytes) at sp+16. |
| stp x29, x30, [sp, #-48]! |
| # q8/q9 are saved and restored but not otherwise touched in this function. |
| stp q8, q9, [sp, #16] |
| mov x29, sp |
| ldr q16, [x0] |
| ldr q17, [x0, #16] |
| # Keep the incoming x0 in x8; x0 becomes the memcpy destination (x1 + 32). |
| mov x8, x0 |
| add x0, x1, #32 |
| fadd v16.4s, v16.4s, v17.4s |
| fmul v17.4s, v16.4s, v17.4s |
| fsub v16.2d, v16.2d, v17.2d |
| # Size argument: writing w2 leaves x2 = 64 (upper 32 bits are zeroed). |
| mov w2, #64 |
| fmax v17.4s, v16.4s, v17.4s |
| fmin v16.2d, v16.2d, v17.2d |
| str q16, [x1] |
| str q17, [x1, #16] |
| # memcpy source: incoming x0 + 32. |
| add x1, x8, #32 |
| bl memcpy |
| ldp q8, q9, [sp, #16] |
| ldp x29, x30, [sp], #48 |
| # Tail branch: x30 is already restored, so use_fp returns to this function's caller. |
| b use_fp |
| .size complex_fp_operation, .-complex_fp_operation |
| |
| # use_fp: empty function that returns immediately; it is the target of the |
| # tail branch at the end of complex_fp_operation. |
| .globl use_fp |
| .type use_fp,@function |
| use_fp: |
| ret |
| .size use_fp, .-use_fp |