| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s |
| ; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s |
| |
| ; Dynamically-sized allocation, needs a loop which can handle any size at |
| ; runtime. The final iteration of the loop will temporarily put SP below the |
| ; target address, but this doesn't break any of the ABI constraints on the |
| ; stack, and also doesn't probe below the target SP value. |
| define void @dynamic(i64 %size, ptr %out) #0 { |
| ; CHECK-LABEL: dynamic: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_def_cfa_offset 16 |
| ; CHECK-NEXT: mov x29, sp |
| ; CHECK-NEXT: .cfi_def_cfa w29, 16 |
| ; CHECK-NEXT: .cfi_offset w30, -8 |
| ; CHECK-NEXT: .cfi_offset w29, -16 |
| ; CHECK-NEXT: add x9, x0, #15 |
| ; CHECK-NEXT: mov x8, sp |
| ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 |
| ; CHECK-NEXT: sub x8, x8, x9 |
| ; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 |
| ; CHECK-NEXT: cmp sp, x8 |
| ; CHECK-NEXT: b.le .LBB0_3 |
| ; CHECK-NEXT: // %bb.2: // in Loop: Header=BB0_1 Depth=1 |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: b .LBB0_1 |
| ; CHECK-NEXT: .LBB0_3: |
| ; CHECK-NEXT: mov sp, x8 |
| ; CHECK-NEXT: ldr xzr, [sp] |
| ; CHECK-NEXT: str x8, [x1] |
| ; CHECK-NEXT: mov sp, x29 |
| ; CHECK-NEXT: .cfi_def_cfa wsp, 16 |
| ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: ret |
| ; Runtime-sized, byte-aligned allocation of %size bytes. The expected assembly |
| ; above computes the target SP as sp - ((%size + 15) & ~15), then lowers SP in |
| ; 4096-byte steps, storing xzr at each step that is still above the target, |
| ; and finally sets SP to the target and loads from it. |
| %v = alloca i8, i64 %size, align 1 |
| ; Write the allocation's address through %out so that the alloca has a use. |
| store ptr %v, ptr %out, align 8 |
| ret void |
| } |
| |
| ; This function has a fixed-size stack slot and a dynamic one. The fixed-size |
| ; slot isn't large enough that we would normally probe it, but we need to do so |
| ; here because otherwise the CSR save and the first probe of the dynamic |
| ; allocation could be too far apart when the size of the dynamic allocation is |
| ; close to the guard size. |
| define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 { |
| ; CHECK-LABEL: dynamic_fixed: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_def_cfa_offset 16 |
| ; CHECK-NEXT: mov x29, sp |
| ; CHECK-NEXT: .cfi_def_cfa w29, 16 |
| ; CHECK-NEXT: .cfi_offset w30, -8 |
| ; CHECK-NEXT: .cfi_offset w29, -16 |
| ; CHECK-NEXT: str xzr, [sp, #-64]! |
| ; CHECK-NEXT: add x9, x0, #15 |
| ; CHECK-NEXT: mov x8, sp |
| ; CHECK-NEXT: sub x10, x29, #64 |
| ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 |
| ; CHECK-NEXT: str x10, [x1] |
| ; CHECK-NEXT: sub x8, x8, x9 |
| ; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 |
| ; CHECK-NEXT: cmp sp, x8 |
| ; CHECK-NEXT: b.le .LBB1_3 |
| ; CHECK-NEXT: // %bb.2: // in Loop: Header=BB1_1 Depth=1 |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: b .LBB1_1 |
| ; CHECK-NEXT: .LBB1_3: |
| ; CHECK-NEXT: mov sp, x8 |
| ; CHECK-NEXT: ldr xzr, [sp] |
| ; CHECK-NEXT: str x8, [x2] |
| ; CHECK-NEXT: mov sp, x29 |
| ; CHECK-NEXT: .cfi_def_cfa wsp, 16 |
| ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: ret |
| ; 64-byte fixed-size slot. The expected assembly above allocates and probes it |
| ; with a single pre-indexed "str xzr, [sp, #-64]!", and stores its address |
| ; (x29 - 64) through %out1. |
| %v1 = alloca i8, i64 64, align 1 |
| store ptr %v1, ptr %out1, align 8 |
| ; Runtime-sized slot. The expected assembly allocates it with the 4096-byte |
| ; probing loop and stores the resulting SP through %out2. |
| %v2 = alloca i8, i64 %size, align 1 |
| store ptr %v2, ptr %out2, align 8 |
| ret void |
| } |
| |
| ; Dynamic allocation, with an alignment requirement greater than the alignment |
| ; of SP. Done by ANDing the target SP with a constant to align it down, then |
| ; doing the loop as normal. Note that we also re-align the stack in the prolog, |
| ; which isn't actually needed because the only aligned allocations are dynamic; |
| ; this re-alignment is done even without stack probing. |
| define void @dynamic_align_64(i64 %size, ptr %out) #0 { |
| ; CHECK-LABEL: dynamic_align_64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_def_cfa_offset 32 |
| ; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill |
| ; CHECK-NEXT: mov x29, sp |
| ; CHECK-NEXT: .cfi_def_cfa w29, 32 |
| ; CHECK-NEXT: .cfi_offset w19, -16 |
| ; CHECK-NEXT: .cfi_offset w30, -24 |
| ; CHECK-NEXT: .cfi_offset w29, -32 |
| ; CHECK-NEXT: sub x9, sp, #32 |
| ; CHECK-NEXT: and sp, x9, #0xffffffffffffffc0 |
| ; CHECK-NEXT: add x9, x0, #15 |
| ; CHECK-NEXT: mov x8, sp |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 |
| ; CHECK-NEXT: mov x19, sp |
| ; CHECK-NEXT: sub x8, x8, x9 |
| ; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0 |
| ; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 |
| ; CHECK-NEXT: cmp sp, x8 |
| ; CHECK-NEXT: b.le .LBB2_3 |
| ; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: b .LBB2_1 |
| ; CHECK-NEXT: .LBB2_3: |
| ; CHECK-NEXT: mov sp, x8 |
| ; CHECK-NEXT: ldr xzr, [sp] |
| ; CHECK-NEXT: str x8, [x1] |
| ; CHECK-NEXT: mov sp, x29 |
| ; CHECK-NEXT: .cfi_def_cfa wsp, 32 |
| ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w19 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: ret |
| ; Runtime-sized allocation that must be 64-byte aligned. In the expected |
| ; assembly above, the prologue first re-aligns SP to 64 bytes and probes it |
| ; with "str xzr, [sp]"; the dynamic target SP is then masked with |
| ; 0xffffffffffffffc0 before the 4096-byte probing loop runs. |
| %v = alloca i8, i64 %size, align 64 |
| ; Write the allocation's address through %out so that the alloca has a use. |
| store ptr %v, ptr %out, align 8 |
| ret void |
| } |
| |
| ; Dynamic allocation, with an alignment greater than the stack guard size. The |
| ; only difference to the dynamic allocation is the constant used for aligning |
| ; the target SP, the loop will probe the whole allocation without needing to |
| ; know about the alignment padding. |
| define void @dynamic_align_8192(i64 %size, ptr %out) #0 { |
| ; CHECK-LABEL: dynamic_align_8192: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_def_cfa_offset 32 |
| ; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill |
| ; CHECK-NEXT: mov x29, sp |
| ; CHECK-NEXT: .cfi_def_cfa w29, 32 |
| ; CHECK-NEXT: .cfi_offset w19, -16 |
| ; CHECK-NEXT: .cfi_offset w30, -24 |
| ; CHECK-NEXT: .cfi_offset w29, -32 |
| ; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 |
| ; CHECK-NEXT: sub x9, x9, #4064 |
| ; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 |
| ; CHECK-NEXT: .LBB3_1: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 |
| ; CHECK-NEXT: cmp sp, x9 |
| ; CHECK-NEXT: b.le .LBB3_3 |
| ; CHECK-NEXT: // %bb.2: // in Loop: Header=BB3_1 Depth=1 |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: b .LBB3_1 |
| ; CHECK-NEXT: .LBB3_3: |
| ; CHECK-NEXT: mov sp, x9 |
| ; CHECK-NEXT: add x9, x0, #15 |
| ; CHECK-NEXT: mov x8, sp |
| ; CHECK-NEXT: ldr xzr, [sp] |
| ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 |
| ; CHECK-NEXT: mov x19, sp |
| ; CHECK-NEXT: sub x8, x8, x9 |
| ; CHECK-NEXT: and x8, x8, #0xffffffffffffe000 |
| ; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 |
| ; CHECK-NEXT: cmp sp, x8 |
| ; CHECK-NEXT: b.le .LBB3_6 |
| ; CHECK-NEXT: // %bb.5: // in Loop: Header=BB3_4 Depth=1 |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: b .LBB3_4 |
| ; CHECK-NEXT: .LBB3_6: |
| ; CHECK-NEXT: mov sp, x8 |
| ; CHECK-NEXT: ldr xzr, [sp] |
| ; CHECK-NEXT: str x8, [x1] |
| ; CHECK-NEXT: mov sp, x29 |
| ; CHECK-NEXT: .cfi_def_cfa wsp, 32 |
| ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w19 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: ret |
| ; Runtime-sized allocation that must be 8192-byte aligned. The expected |
| ; assembly above contains two 4096-byte probing loops: the first (.LBB3_1) |
| ; walks SP down to the 8192-byte re-aligned prologue SP, and the second |
| ; (.LBB3_4) walks SP down to the dynamic target, which is masked with |
| ; 0xffffffffffffe000. |
| %v = alloca i8, i64 %size, align 8192 |
| ; Write the allocation's address through %out so that the alloca has a use. |
| store ptr %v, ptr %out, align 8 |
| ret void |
| } |
| |
| ; For 64k guard pages, the only difference is the constant subtracted from SP |
| ; in the loop. |
| define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" { |
| ; CHECK-LABEL: dynamic_64k_guard: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_def_cfa_offset 16 |
| ; CHECK-NEXT: mov x29, sp |
| ; CHECK-NEXT: .cfi_def_cfa w29, 16 |
| ; CHECK-NEXT: .cfi_offset w30, -8 |
| ; CHECK-NEXT: .cfi_offset w29, -16 |
| ; CHECK-NEXT: add x9, x0, #15 |
| ; CHECK-NEXT: mov x8, sp |
| ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 |
| ; CHECK-NEXT: sub x8, x8, x9 |
| ; CHECK-NEXT: .LBB4_1: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 |
| ; CHECK-NEXT: cmp sp, x8 |
| ; CHECK-NEXT: b.le .LBB4_3 |
| ; CHECK-NEXT: // %bb.2: // in Loop: Header=BB4_1 Depth=1 |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: b .LBB4_1 |
| ; CHECK-NEXT: .LBB4_3: |
| ; CHECK-NEXT: mov sp, x8 |
| ; CHECK-NEXT: ldr xzr, [sp] |
| ; CHECK-NEXT: str x8, [x1] |
| ; CHECK-NEXT: mov sp, x29 |
| ; CHECK-NEXT: .cfi_def_cfa wsp, 16 |
| ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: ret |
| ; Same IR body as @dynamic, but this function also carries |
| ; "stack-probe-size"="65536"; the expected probing loop above therefore |
| ; lowers SP by 65536 bytes per iteration instead of 4096. |
| %v = alloca i8, i64 %size, align 1 |
| ; Write the allocation's address through %out so that the alloca has a use. |
| store ptr %v, ptr %out, align 8 |
| ret void |
| } |
| |
| ; If a function has variable-sized stack objects, then any function calls which |
| ; need to pass arguments on the stack must allocate the stack space for them |
| ; dynamically, to ensure they are at the bottom of the frame. We need to probe |
| ; that space when it is larger than the unprobed space allowed by the ABI (1024 |
| ; bytes), so this needs a very large number of arguments. |
| define void @no_reserved_call_frame(i64 %n) #0 { |
| ; CHECK-LABEL: no_reserved_call_frame: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_def_cfa_offset 16 |
| ; CHECK-NEXT: mov x29, sp |
| ; CHECK-NEXT: .cfi_def_cfa w29, 16 |
| ; CHECK-NEXT: .cfi_offset w30, -8 |
| ; CHECK-NEXT: .cfi_offset w29, -16 |
| ; CHECK-NEXT: lsl x9, x0, #2 |
| ; CHECK-NEXT: mov x8, sp |
| ; CHECK-NEXT: add x9, x9, #15 |
| ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 |
| ; CHECK-NEXT: sub x0, x8, x9 |
| ; CHECK-NEXT: .LBB5_1: // %entry |
| ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 |
| ; CHECK-NEXT: cmp sp, x0 |
| ; CHECK-NEXT: b.le .LBB5_3 |
| ; CHECK-NEXT: // %bb.2: // %entry |
| ; CHECK-NEXT: // in Loop: Header=BB5_1 Depth=1 |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: b .LBB5_1 |
| ; CHECK-NEXT: .LBB5_3: // %entry |
| ; CHECK-NEXT: mov sp, x0 |
| ; CHECK-NEXT: ldr xzr, [sp] |
| ; CHECK-NEXT: sub sp, sp, #1104 |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: bl callee_stack_args |
| ; CHECK-NEXT: add sp, sp, #1104 |
| ; CHECK-NEXT: mov sp, x29 |
| ; CHECK-NEXT: .cfi_def_cfa wsp, 16 |
| ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: ret |
| entry: |
| ; Runtime-sized array of %n i32 elements. The expected assembly above scales |
| ; %n by 4 (lsl #2), rounds the size up to 16 bytes, and allocates it with the |
| ; 4096-byte probing loop. |
| %v = alloca i32, i64 %n |
| ; The expected assembly lowers SP by a further 1104 bytes (138 * 8) around |
| ; this call and probes the new SP with "str xzr, [sp]" before the bl. |
| call void @callee_stack_args(ptr %v, [138 x i64] undef) |
| ret void |
| } |
| |
| ; Same as above but without a variable-sized allocation, so the reserved call |
| ; frame can be folded into the fixed-size allocation in the prologue. |
| define void @reserved_call_frame(i64 %n) #0 { |
| ; CHECK-LABEL: reserved_call_frame: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_def_cfa_offset 32 |
| ; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill |
| ; CHECK-NEXT: mov x29, sp |
| ; CHECK-NEXT: .cfi_def_cfa w29, 32 |
| ; CHECK-NEXT: .cfi_offset w28, -16 |
| ; CHECK-NEXT: .cfi_offset w30, -24 |
| ; CHECK-NEXT: .cfi_offset w29, -32 |
| ; CHECK-NEXT: sub sp, sp, #1504 |
| ; CHECK-NEXT: add x0, sp, #1104 |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: bl callee_stack_args |
| ; CHECK-NEXT: add sp, sp, #1504 |
| ; CHECK-NEXT: .cfi_def_cfa wsp, 32 |
| ; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w28 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: ret |
| entry: |
| ; Constant element count (100 x i32 = 400 bytes); %n is not used by the body. |
| %v = alloca i32, i64 100 |
| ; The expected assembly above makes one 1504-byte SP adjustment (1104 + 400), |
| ; probes it with a single "str xzr, [sp]", and passes sp + 1104 in x0 as the |
| ; pointer argument. |
| call void @callee_stack_args(ptr %v, [138 x i64] undef) |
| ret void |
| } |
| |
| ; External callee used by @no_reserved_call_frame and @reserved_call_frame; it |
| ; takes a pointer plus a 138-element i64 array passed by value. |
| declare void @callee_stack_args(ptr, [138 x i64]) |
| |
| ; Dynamic allocation of SVE vectors |
| define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" { |
| ; CHECK-LABEL: dynamic_sve: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_def_cfa_offset 32 |
| ; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill |
| ; CHECK-NEXT: mov x29, sp |
| ; CHECK-NEXT: .cfi_def_cfa w29, 32 |
| ; CHECK-NEXT: .cfi_offset w19, -16 |
| ; CHECK-NEXT: .cfi_offset w30, -24 |
| ; CHECK-NEXT: .cfi_offset w29, -32 |
| ; CHECK-NEXT: rdvl x9, #1 |
| ; CHECK-NEXT: mov x10, #15 // =0xf |
| ; CHECK-NEXT: mov x8, sp |
| ; CHECK-NEXT: madd x9, x0, x9, x10 |
| ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 |
| ; CHECK-NEXT: sub x8, x8, x9 |
| ; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 |
| ; CHECK-NEXT: cmp sp, x8 |
| ; CHECK-NEXT: b.le .LBB7_3 |
| ; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 |
| ; CHECK-NEXT: str xzr, [sp] |
| ; CHECK-NEXT: b .LBB7_1 |
| ; CHECK-NEXT: .LBB7_3: |
| ; CHECK-NEXT: mov sp, x8 |
| ; CHECK-NEXT: ldr xzr, [sp] |
| ; CHECK-NEXT: str x8, [x1] |
| ; CHECK-NEXT: mov sp, x29 |
| ; CHECK-NEXT: .cfi_def_cfa wsp, 32 |
| ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w19 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: ret |
| ; Runtime count of scalable vectors. The expected assembly above reads the |
| ; vector length with "rdvl x9, #1", computes %size * x9 + 15 with madd, |
| ; rounds down to a multiple of 16, and allocates the result with the |
| ; 4096-byte probing loop. |
| %v = alloca <vscale x 4 x float>, i64 %size, align 16 |
| ; Write the allocation's address through %out so that the alloca has a use. |
| store ptr %v, ptr %out, align 8 |
| ret void |
| } |
| |
| ; Attribute group shared by every test function above: asynchronous unwind |
| ; tables, inline stack probing ("probe-stack"="inline-asm"), and |
| ; "frame-pointer"="none". |
| attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } |