## This test checks that BOLT correctly inlines memcpy calls on AArch64.
# REQUIRES: system-linux, aarch64-registered-target
# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 18 total call sites: 17 to memcpy, 1 to _memcpy8)
# CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls
# Each inlined call should be replaced by optimal size-specific instructions, leaving NO memcpy call in that function
# 1-byte copy should use single byte load/store (ldrb/strb)
# CHECK-ASM-LABEL: <test_1_byte_direct>:
# CHECK-ASM: ldrb{{.*}}w9, [x1]
# CHECK-ASM-NEXT: strb{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 2-byte copy should use single 16-bit load/store (ldrh/strh)
# CHECK-ASM-LABEL: <test_2_byte_direct>:
# CHECK-ASM: ldrh{{.*}}w9, [x1]
# CHECK-ASM-NEXT: strh{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 4-byte copy should use single 32-bit load/store (w register)
# CHECK-ASM-LABEL: <test_4_byte_direct>:
# CHECK-ASM: ldr{{.*}}w9, [x1]
# CHECK-ASM-NEXT: str{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 8-byte copy should use single 64-bit load/store (x register)
# CHECK-ASM-LABEL: <test_8_byte_direct>:
# CHECK-ASM: ldr{{.*}}x9, [x1]
# CHECK-ASM-NEXT: str{{.*}}x9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 16-byte copy should use single 128-bit SIMD load/store (q register)
# CHECK-ASM-LABEL: <test_16_byte_direct>:
# CHECK-ASM: ldr{{.*}}q16, [x1]
# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 32-byte copy should use two 128-bit SIMD operations
# CHECK-ASM-LABEL: <test_32_byte_direct>:
# CHECK-ASM: ldr{{.*}}q16, [x1]
# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
# CHECK-ASM-NEXT: ldr{{.*}}q17, [x1, #0x10]
# CHECK-ASM-NEXT: str{{.*}}q17, [x0, #0x10]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
# CHECK-ASM: ldr{{.*}}q16, [x1]
# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10]
# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10]
# CHECK-ASM-NEXT: ldr{{.*}}w9, [x1, #0x20]
# CHECK-ASM-NEXT: str{{.*}}w9, [x0, #0x20]
# CHECK-ASM-NEXT: ldrb{{.*}}w9, [x1, #0x24]
# CHECK-ASM-NEXT: strb{{.*}}w9, [x0, #0x24]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
# CHECK-ASM-LABEL: <test_0_byte>:
# CHECK-ASM-NOT: ldr
# CHECK-ASM-NOT: str
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# Negative size should NOT be inlined (invalid size parameter)
# CHECK-ASM-LABEL: <test_negative_size>:
# CHECK-ASM: bl{{.*}}<memcpy
# 128-byte copy should NOT be inlined (too large, original call preserved)
# CHECK-ASM-LABEL: <test_128_byte_too_large>:
# CHECK-ASM: bl{{.*}}<memcpy
# ADD immediate with non-zero source should NOT be inlined (can't track mov+add chain)
# CHECK-ASM-LABEL: <test_4_byte_add_immediate>:
# CHECK-ASM: bl{{.*}}<memcpy
# Register move should NOT be inlined (size unknown at compile time)
# CHECK-ASM-LABEL: <test_register_move_unknown>:
# CHECK-ASM: bl{{.*}}<memcpy
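# X2 overwritten by a load after the constant move should NOT be inlined (size unknown at compile time)
# CHECK-ASM-LABEL: <test_x2_rewrite_unknown>: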
# CHECK-ASM: bl{{.*}}<memcpy
# Live-in parameter should NOT be inlined (size unknown at compile time)
# CHECK-ASM-LABEL: <test_live_in_unknown>:
# CHECK-ASM: bl{{.*}}<memcpy
# _memcpy8 should be inlined with end-pointer return (dest+size)
# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
# CHECK-ASM: ldr{{.*}}w9, [x1]
# CHECK-ASM-NEXT: str{{.*}}w9, [x0]
# CHECK-ASM-NEXT: add{{.*}}x0, x0, #0x4
# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
# Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register
# CHECK-ASM-LABEL: <complex_operation>:
# CHECK-ASM: ldr{{.*}}x9, [x1]
# CHECK-ASM-NEXT: str{{.*}}x9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# Complex function with caller-saved Q16/Q17 should inline 64-byte memcpy using Q16 as temp register
# CHECK-ASM-LABEL: <complex_fp_operation>:
# CHECK-ASM: ldr{{.*}}q16, [x1]
# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10]
# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10]
# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x20]
# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x20]
# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x30]
# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x30]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
.text
# test_1_byte_direct: memcpy(sp+24, sp+16, 1) with the size set by an immediate
# move right before the call. The expectations at the top of this file require
# the call to be replaced by one ldrb/strb pair through w9.
# Frame (32 bytes): [sp+0] x29, [sp+8] x30, [sp+16] source, [sp+24] destination.
.globl test_1_byte_direct
.type test_1_byte_direct,@function
test_1_byte_direct:
    stp x29, x30, [sp, #-32]!
    mov x29, sp
    add x1, sp, #16
    # The destination is placed above the source so the copy cannot overwrite
    # the x29/x30 pair saved at [sp, #0] .. [sp, #15].
    add x0, sp, #24
    mov x2, #1
    bl memcpy
    ldp x29, x30, [sp], #32
    ret
.size test_1_byte_direct, .-test_1_byte_direct
# test_2_byte_direct: memcpy(sp+24, sp+16, 2) with an immediate size. The
# expectations at the top of this file require the call to be replaced by one
# ldrh/strh pair through w9.
# Frame (32 bytes): [sp+0] x29, [sp+8] x30, [sp+16] source, [sp+24] destination.
.globl test_2_byte_direct
.type test_2_byte_direct,@function
test_2_byte_direct:
    stp x29, x30, [sp, #-32]!
    mov x29, sp
    add x1, sp, #16
    # The destination is placed above the source so the copy cannot overwrite
    # the x29/x30 pair saved at [sp, #0] .. [sp, #15].
    add x0, sp, #24
    mov x2, #2
    bl memcpy
    ldp x29, x30, [sp], #32
    ret
.size test_2_byte_direct, .-test_2_byte_direct
# test_4_byte_direct: memcpy(sp+24, sp+16, 4) with an immediate size. The
# expectations at the top of this file require the call to be replaced by one
# 32-bit ldr/str pair through w9.
# Frame (32 bytes): [sp+0] x29, [sp+8] x30, [sp+16] source, [sp+24] destination.
.globl test_4_byte_direct
.type test_4_byte_direct,@function
test_4_byte_direct:
    stp x29, x30, [sp, #-32]!
    mov x29, sp
    add x1, sp, #16
    # The destination is placed above the source so the copy cannot overwrite
    # the x29/x30 pair saved at [sp, #0] .. [sp, #15].
    add x0, sp, #24
    mov x2, #4
    bl memcpy
    ldp x29, x30, [sp], #32
    ret
.size test_4_byte_direct, .-test_4_byte_direct
# test_8_byte_direct: memcpy(sp+24, sp+16, 8) with an immediate size. The
# expectations at the top of this file require the call to be replaced by one
# 64-bit ldr/str pair through x9.
# Frame (32 bytes): [sp+0] x29, [sp+8] x30, [sp+16] source, [sp+24] destination.
.globl test_8_byte_direct
.type test_8_byte_direct,@function
test_8_byte_direct:
    stp x29, x30, [sp, #-32]!
    mov x29, sp
    add x1, sp, #16
    # An 8-byte store to [sp, #8] would replace the saved x30 entirely, so the
    # destination uses the free slot at [sp, #24] .. [sp, #31] instead.
    add x0, sp, #24
    mov x2, #8
    bl memcpy
    ldp x29, x30, [sp], #32
    ret
.size test_8_byte_direct, .-test_8_byte_direct
# test_16_byte_direct: memcpy(sp+32, sp+16, 16) with an immediate size. The
# expectations at the top of this file require the call to be replaced by a
# single 128-bit ldr/str pair through q16.
# Frame (48 bytes): [sp+0] x29/x30, [sp+16] source, [sp+32] destination.
.globl test_16_byte_direct
.type test_16_byte_direct,@function
test_16_byte_direct:
stp x29, x30, [sp, #-48]!
mov x29, sp
# x1 = source, x0 = destination, x2 = byte count (memcpy argument registers).
add x1, sp, #16
add x0, sp, #32
mov x2, #16
bl memcpy
ldp x29, x30, [sp], #48
ret
.size test_16_byte_direct, .-test_16_byte_direct
# test_32_byte_direct: memcpy(sp+48, sp+16, 32) with an immediate size. The
# expectations at the top of this file require two 128-bit ldr/str pairs, the
# first through q16 and the second through q17 at offset 0x10.
# Frame (80 bytes): [sp+0] x29/x30, [sp+16] source, [sp+48] destination.
.globl test_32_byte_direct
.type test_32_byte_direct,@function
test_32_byte_direct:
stp x29, x30, [sp, #-80]!
mov x29, sp
# x1 = source, x0 = destination, x2 = byte count (memcpy argument registers).
add x1, sp, #16
add x0, sp, #48
mov x2, #32
bl memcpy
ldp x29, x30, [sp], #80
ret
.size test_32_byte_direct, .-test_32_byte_direct
# test_37_byte_arbitrary: memcpy(sp+56, sp+16, 37) with an immediate size that
# is not a power of two. The expectations at the top of this file require the
# copy to be split into 16 + 16 + 4 + 1 bytes at offsets 0x0, 0x10, 0x20, 0x24.
# Frame (96 bytes): [sp+0] x29/x30, [sp+16] source, [sp+56] destination.
.globl test_37_byte_arbitrary
.type test_37_byte_arbitrary,@function
test_37_byte_arbitrary:
stp x29, x30, [sp, #-96]!
mov x29, sp
# x1 = source, x0 = destination, x2 = byte count (memcpy argument registers).
add x1, sp, #16
add x0, sp, #56
mov x2, #37
bl memcpy
ldp x29, x30, [sp], #96
ret
.size test_37_byte_arbitrary, .-test_37_byte_arbitrary
# test_0_byte: memcpy(sp+8, sp+16, 0). The expectations at the top of this file
# require the call to disappear without any ldr/str being emitted in its place.
.globl test_0_byte
.type test_0_byte,@function
test_0_byte:
stp x29, x30, [sp, #-32]!
mov x29, sp
add x1, sp, #16
add x0, sp, #8
# Zero-length copy: nothing is read or written through x0/x1.
mov x2, #0
bl memcpy
ldp x29, x30, [sp], #32
ret
.size test_0_byte, .-test_0_byte
# test_negative_size: memcpy call whose size register is loaded with -1. The
# expectations at the top of this file require the call to be left in place.
.globl test_negative_size
.type test_negative_size,@function
test_negative_size:
# Negative size should not be inlined
stp x29, x30, [sp, #-32]!
mov x29, sp
add x1, sp, #16
add x0, sp, #8
# x2 = -1, i.e. all bits set when read as an unsigned byte count.
mov x2, #-1
bl memcpy
ldp x29, x30, [sp], #32
ret
.size test_negative_size, .-test_negative_size
# test_128_byte_too_large: memcpy(sp+152, sp+16, 128) with an immediate size.
# The expectations at the top of this file require the call to be preserved
# because the copy is considered too large to inline.
# Frame (288 bytes): [sp+0] x29/x30, [sp+16] source, [sp+152] destination.
.globl test_128_byte_too_large
.type test_128_byte_too_large,@function
test_128_byte_too_large:
stp x29, x30, [sp, #-288]!
mov x29, sp
# x1 = source, x0 = destination, x2 = byte count (memcpy argument registers).
add x1, sp, #16
add x0, sp, #152
mov x2, #128
bl memcpy
ldp x29, x30, [sp], #288
ret
.size test_128_byte_too_large, .-test_128_byte_too_large
# test_4_byte_add_immediate: memcpy(sp+24, sp+16, 4) where the size is produced
# by a mov followed by an add-immediate instead of a single immediate move.
# The expectations at the top of this file require the call to be preserved.
# Frame (32 bytes): [sp+0] x29, [sp+8] x30, [sp+16] source, [sp+24] destination.
.globl test_4_byte_add_immediate
.type test_4_byte_add_immediate,@function
test_4_byte_add_immediate:
    stp x29, x30, [sp, #-32]!
    mov x29, sp
    add x1, sp, #16
    # The destination is placed above the source so the copy cannot overwrite
    # the x29/x30 pair saved at [sp, #0] .. [sp, #15].
    add x0, sp, #24
    # x2 = x3 + 4 with x3 = 0: the value is 4, but it is built in two steps.
    mov x3, #0
    add x2, x3, #4
    bl memcpy
    ldp x29, x30, [sp], #32
    ret
.size test_4_byte_add_immediate, .-test_4_byte_add_immediate
# test_register_move_unknown: memcpy(sp+24, sp+16, 4) where the size reaches x2
# through a register-to-register move from x6. The expectations at the top of
# this file require the call to be preserved.
# Frame (32 bytes): [sp+0] x29, [sp+8] x30, [sp+16] source, [sp+24] destination.
.globl test_register_move_unknown
.type test_register_move_unknown,@function
test_register_move_unknown:
    stp x29, x30, [sp, #-32]!
    mov x29, sp
    add x1, sp, #16
    # The destination is placed above the source so the copy cannot overwrite
    # the x29/x30 pair saved at [sp, #0] .. [sp, #15].
    add x0, sp, #24
    # The constant is materialized in x6 first; x2 only receives a copy of it.
    mov x6, #4
    mov x2, x6
    bl memcpy
    ldp x29, x30, [sp], #32
    ret
.size test_register_move_unknown, .-test_register_move_unknown
# test_x2_rewrite_unknown: x2 is first set to the constant 8 and then
# overwritten by a load from the stack before the memcpy call. The expectations
# at the top of this file require the call to be preserved.
# x0 and x1 are not set here; they are whatever the caller passed in.
# NOTE(review): x30 is not saved around the bl, so the final ret would branch
# back to the instruction after the call - presumably acceptable because this
# file is only assembled, rewritten and disassembled; confirm it is never run.
.globl test_x2_rewrite_unknown
.type test_x2_rewrite_unknown,@function
test_x2_rewrite_unknown:
mov x2, #8
# This load replaces the constant above, so the final size is not an immediate.
ldr x2, [sp, #24]
bl memcpy
ret
.size test_x2_rewrite_unknown, .-test_x2_rewrite_unknown
# test_live_in_unknown: memcpy(sp+8, sp+16, x2) where x2 is never written inside
# the function. The expectations at the top of this file require the call to be
# preserved.
# NOTE(review): the destination sp+8 is the slot holding the saved x30, so any
# non-zero size would overwrite the return address if this were executed.
.globl test_live_in_unknown
.type test_live_in_unknown,@function
test_live_in_unknown:
# x2 comes in as parameter, no instruction sets it (should NOT inline)
stp x29, x30, [sp, #-32]!
mov x29, sp
add x1, sp, #16
add x0, sp, #8
# x2 is live-in, no size-setting instruction
bl memcpy
ldp x29, x30, [sp], #32
ret
.size test_live_in_unknown, .-test_live_in_unknown
# test_memcpy8_4_byte: _memcpy8(sp+24, sp+16, 4) with an immediate size. The
# expectations at the top of this file require the call to be replaced by one
# 32-bit ldr/str pair through w9 followed by "add x0, x0, #4", so that x0 ends
# up pointing one past the last byte written.
# Frame (32 bytes): [sp+0] x29, [sp+8] x30, [sp+16] source, [sp+24] destination.
.globl test_memcpy8_4_byte
.type test_memcpy8_4_byte,@function
test_memcpy8_4_byte:
    stp x29, x30, [sp, #-32]!
    mov x29, sp
    add x1, sp, #16
    # The destination is placed above the source so the copy cannot overwrite
    # the x29/x30 pair saved at [sp, #0] .. [sp, #15].
    add x0, sp, #24
    mov x2, #4
    bl _memcpy8
    ldp x29, x30, [sp], #32
    ret
.size test_memcpy8_4_byte, .-test_memcpy8_4_byte
# Simple _memcpy8 implementation that calls memcpy and returns dest+size.
# In:  x0 = destination, x1 = source, x2 = byte count.
# Out: x0 = destination + byte count.
# The size in x2 is live-in here (nothing in this function writes it), so the
# inner memcpy call has no constant size available at its call site.
# Frame (32 bytes): [sp+0] x29, [sp+8] x30, [sp+16] x19, [sp+24] x20.
.globl _memcpy8
.type _memcpy8,@function
_memcpy8:
    stp x29, x30, [sp, #-32]!
    stp x19, x20, [sp, #16]
    mov x29, sp
    # x0-x17 are caller-saved under AAPCS64, so memcpy may clobber them. The
    # destination and size are kept in callee-saved x19/x20 across the call.
    mov x19, x0
    mov x20, x2
    bl memcpy
    add x0, x19, x20
    ldp x19, x20, [sp, #16]
    ldp x29, x30, [sp], #32
    ret
.size _memcpy8, .-_memcpy8
# complex_operation: integer fixture in which x9 (and x10-x13) are in use as
# scratch registers before the memcpy call. The expectations at the top of this
# file require the 8-byte copy to be inlined as a 64-bit ldr/str pair via x9.
# In:  x0 = input block (four 64-bit words read, then 8 bytes copied from +32)
#      x1 = output block (four 64-bit words written, then 8 bytes copied to +32)
#      x2, x3, x4 = operands of the arithmetic below.
# Tail-calls use with x0 = the 64-bit word at output block + 16.
.globl complex_operation
.type complex_operation,@function
complex_operation:
stp x29, x30, [sp, #-32]!
str x19, [sp, #16]
mov x29, sp
ldp x9, x10, [x0]
ldp x11, x12, [x0, #16]
# x19 keeps the output pointer across the call; x8 keeps the input pointer
# until the source argument is formed below.
mov x19, x1
mov x8, x0
# memcpy destination = output block + 32.
add x0, x1, #32
madd x9, x9, x2, x3
and x10, x10, x4
asr x12, x12, #2
# memcpy size = 8; set here, several instructions ahead of the call.
mov w2, #8
orr x11, x12, x11, lsl #3
eor x12, x9, x10
mul x10, x11, x10
eor x12, x12, x11
add x13, x12, x9
add x9, x11, x9, asr #4
stp x13, x10, [x1]
mov w10, w12
stp x9, x10, [x1, #16]
# memcpy source = input block + 32.
add x1, x8, #32
bl memcpy
ldr x0, [x19, #16]
ldr x19, [sp, #16]
ldp x29, x30, [sp], #32
# Tail call: the frame is already torn down, so use returns to our caller.
b use
.size complex_operation, .-complex_operation
# use: tail-call target of complex_operation; returns immediately.
.globl use
.type use,@function
use:
ret
.size use, .-use
# Same idea as complex_operation, but using the FP/SIMD caller-saved registers
# (Q16/17) as scratch before the memcpy call. The expectations at the top of
# this file require the 64-byte copy to be inlined as four q16 ldr/str pairs at
# offsets 0x0, 0x10, 0x20 and 0x30.
# In:  x0 = input block (two 128-bit vectors read, then 64 bytes copied from +32)
#      x1 = output block (two 128-bit vectors written, then 64 bytes copied to +32)
# Tail-calls use_fp.
.globl complex_fp_operation
.type complex_fp_operation,@function
complex_fp_operation:
stp x29, x30, [sp, #-48]!
# q8/q9 are saved and restored but not otherwise referenced in this function.
stp q8, q9, [sp, #16]
mov x29, sp
ldr q16, [x0]
ldr q17, [x0, #16]
# x8 keeps the input pointer until the source argument is formed below.
mov x8, x0
# memcpy destination = output block + 32.
add x0, x1, #32
fadd v16.4s, v16.4s, v17.4s
fmul v17.4s, v16.4s, v17.4s
fsub v16.2d, v16.2d, v17.2d
# memcpy size = 64; set here, several instructions ahead of the call.
mov w2, #64
fmax v17.4s, v16.4s, v17.4s
fmin v16.2d, v16.2d, v17.2d
str q16, [x1]
str q17, [x1, #16]
# memcpy source = input block + 32.
add x1, x8, #32
bl memcpy
ldp q8, q9, [sp, #16]
ldp x29, x30, [sp], #48
# Tail call: the frame is already torn down, so use_fp returns to our caller.
b use_fp
.size complex_fp_operation, .-complex_fp_operation
# use_fp: tail-call target of complex_fp_operation; returns immediately.
.globl use_fp
.type use_fp,@function
use_fp:
ret
.size use_fp, .-use_fp