; blob: bf026d9ef8642b312e4386205c3a87852d69c75e [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=arm64-apple-macosx -mcpu=apple-m4 < %s | FileCheck %s
; Test that FPR copies in functions with streaming-mode changes use
; SVE/scalar instructions instead of NEON, to avoid emitting instructions that
; are illegal in streaming regions. For the 32-bit and 64-bit FPR cases, Apple
; zero-cycle moves can also trigger this issue.
; External callee carrying the "aarch64_pstate_sm_enabled" attribute. In the
; expected output, calls to it are bracketed by `smstart sm` / `smstop sm`.
declare void @streaming_callee(ptr, ptr, <2 x double>, <2 x double>, <2 x double>, <2 x double>) "aarch64_pstate_sm_enabled"
; External callee with the same signature but no streaming attribute; calls to
; it are expected to be emitted without any mode switch.
declare void @normal_callee(ptr, ptr, <2 x double>, <2 x double>, <2 x double>, <2 x double>)
; 128-bit vector case. @caller has no streaming attribute of its own and calls
; @streaming_callee with %C passed as both the 3rd and the 6th argument, so the
; lowered call needs a register-to-register copy of a <2 x double> value after
; `smstart sm`. The assertions expect that copy in SVE form (`mov z3.d, z0.d`).
; The incoming vector arguments are spilled before `smstart sm` and reloaded
; after it, and d8-d15 are saved/restored in the prologue/epilogue.
define void @caller(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %S) "target-features"="+sme2" {
; CHECK-LABEL: caller:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: sub sp, sp, #128
; CHECK-NEXT: .cfi_def_cfa_offset 128
; CHECK-NEXT: stp d15, d14, [sp, #48] ; 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #64] ; 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #80] ; 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #96] ; 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #112] ; 16-byte Folded Spill
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_offset b8, -24
; CHECK-NEXT: .cfi_offset b9, -32
; CHECK-NEXT: .cfi_offset b10, -40
; CHECK-NEXT: .cfi_offset b11, -48
; CHECK-NEXT: .cfi_offset b12, -56
; CHECK-NEXT: .cfi_offset b13, -64
; CHECK-NEXT: .cfi_offset b14, -72
; CHECK-NEXT: .cfi_offset b15, -80
; CHECK-NEXT: stp q1, q0, [sp, #16] ; 32-byte Folded Spill
; CHECK-NEXT: fneg.2d v0, v1
; CHECK-NEXT: str q0, [sp] ; 16-byte Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldp q2, q0, [sp, #16] ; 32-byte Folded Reload
; CHECK-NEXT: ldr q1, [sp] ; 16-byte Reload
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: bl _streaming_callee
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldp x29, x30, [sp, #112] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #96] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #80] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #64] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #48] ; 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #128
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: .cfi_restore b8
; CHECK-NEXT: .cfi_restore b9
; CHECK-NEXT: .cfi_restore b10
; CHECK-NEXT: .cfi_restore b11
; CHECK-NEXT: .cfi_restore b12
; CHECK-NEXT: .cfi_restore b13
; CHECK-NEXT: .cfi_restore b14
; CHECK-NEXT: .cfi_restore b15
; CHECK-NEXT: ret
entry:
; The negation is expected to be emitted as NEON `fneg.2d` ahead of
; `smstart sm`, i.e. outside the streaming region.
%negS = fneg <2 x double> %S
; %C appears twice in the argument list; the second occurrence is what forces
; the in-region register copy.
call void @streaming_callee(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %negS, <2 x double> %S, <2 x double> %C)
ret void
}
; Streaming ("aarch64_pstate_sm_enabled") callee taking scalar double arguments.
declare void @streaming_callee_d(i64, i64, double, double, double, double) "aarch64_pstate_sm_enabled"
; 64-bit FPR case. %c is passed as both the 3rd and the 6th argument of the
; streaming call, so a double register copy is needed between `smstart sm` and
; `smstop sm`. The assertions expect it as the scalar `fmov d3, d0`.
define void @fpr64_copy(i64 %n, i64 %m, double %c, double %s) "target-features"="+sme2" {
; CHECK-LABEL: fpr64_copy:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: sub sp, sp, #112
; CHECK-NEXT: .cfi_def_cfa_offset 112
; CHECK-NEXT: stp d15, d14, [sp, #32] ; 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] ; 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] ; 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #80] ; 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #96] ; 16-byte Folded Spill
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_offset b8, -24
; CHECK-NEXT: .cfi_offset b9, -32
; CHECK-NEXT: .cfi_offset b10, -40
; CHECK-NEXT: .cfi_offset b11, -48
; CHECK-NEXT: .cfi_offset b12, -56
; CHECK-NEXT: .cfi_offset b13, -64
; CHECK-NEXT: .cfi_offset b14, -72
; CHECK-NEXT: .cfi_offset b15, -80
; CHECK-NEXT: stp d1, d0, [sp, #16] ; 16-byte Folded Spill
; CHECK-NEXT: fneg d0, d1
; CHECK-NEXT: str d0, [sp, #8] ; 8-byte Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldp d2, d0, [sp, #16] ; 16-byte Folded Reload
; CHECK-NEXT: ldr d1, [sp, #8] ; 8-byte Reload
; CHECK-NEXT: fmov d3, d0
; CHECK-NEXT: bl _streaming_callee_d
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldp x29, x30, [sp, #96] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #80] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #64] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #48] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #32] ; 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: .cfi_restore b8
; CHECK-NEXT: .cfi_restore b9
; CHECK-NEXT: .cfi_restore b10
; CHECK-NEXT: .cfi_restore b11
; CHECK-NEXT: .cfi_restore b12
; CHECK-NEXT: .cfi_restore b13
; CHECK-NEXT: .cfi_restore b14
; CHECK-NEXT: .cfi_restore b15
; CHECK-NEXT: ret
entry:
; Expected to be computed (and spilled) before the switch to streaming mode.
%negs = fneg double %s
; %c is used twice so that the lowered call contains a d-register copy.
call void @streaming_callee_d(i64 %n, i64 %m, double %c, double %negs, double %s, double %c)
ret void
}
; Streaming ("aarch64_pstate_sm_enabled") callee taking scalar float arguments.
declare void @streaming_callee_f(i64, i64, float, float, float, float) "aarch64_pstate_sm_enabled"
; 32-bit FPR case. %c is passed as both the 3rd and the 6th argument of the
; streaming call, so a float register copy is needed between `smstart sm` and
; `smstop sm`. The assertions expect it as the scalar `fmov s3, s0`.
define void @fpr32_copy(i64 %n, i64 %m, float %c, float %s) "target-features"="+sme2" {
; CHECK-LABEL: fpr32_copy:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: sub sp, sp, #96
; CHECK-NEXT: .cfi_def_cfa_offset 96
; CHECK-NEXT: stp d15, d14, [sp, #16] ; 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] ; 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] ; 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] ; 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_offset b8, -24
; CHECK-NEXT: .cfi_offset b9, -32
; CHECK-NEXT: .cfi_offset b10, -40
; CHECK-NEXT: .cfi_offset b11, -48
; CHECK-NEXT: .cfi_offset b12, -56
; CHECK-NEXT: .cfi_offset b13, -64
; CHECK-NEXT: .cfi_offset b14, -72
; CHECK-NEXT: .cfi_offset b15, -80
; CHECK-NEXT: stp s1, s0, [sp, #8] ; 8-byte Folded Spill
; CHECK-NEXT: fneg s0, s1
; CHECK-NEXT: str s0, [sp, #4] ; 4-byte Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldp s2, s0, [sp, #8] ; 8-byte Folded Reload
; CHECK-NEXT: ldr s1, [sp, #4] ; 4-byte Reload
; CHECK-NEXT: fmov s3, s0
; CHECK-NEXT: bl _streaming_callee_f
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] ; 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: .cfi_restore b8
; CHECK-NEXT: .cfi_restore b9
; CHECK-NEXT: .cfi_restore b10
; CHECK-NEXT: .cfi_restore b11
; CHECK-NEXT: .cfi_restore b12
; CHECK-NEXT: .cfi_restore b13
; CHECK-NEXT: .cfi_restore b14
; CHECK-NEXT: .cfi_restore b15
; CHECK-NEXT: ret
entry:
; Expected to be computed (and spilled) before the switch to streaming mode.
%negs = fneg float %s
; %c is used twice so that the lowered call contains an s-register copy.
call void @streaming_callee_f(i64 %n, i64 %m, float %c, float %negs, float %s, float %c)
ret void
}
; Streaming ("aarch64_pstate_sm_enabled") callee taking scalar half arguments.
declare void @streaming_callee_h(i64, i64, half, half, half, half) "aarch64_pstate_sm_enabled"
; 16-bit FPR case. %c is passed as both the 3rd and the 6th argument of the
; streaming call. The assertions expect the in-region copy to be performed on
; the 32-bit S registers (`fmov s3, s0`), and the half values to be spilled and
; reloaded with individual `str h` / `ldr h` instructions rather than pairs.
define void @fpr16_copy(i64 %n, i64 %m, half %c, half %s) "target-features"="+sme2" {
; CHECK-LABEL: fpr16_copy:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: sub sp, sp, #96
; CHECK-NEXT: .cfi_def_cfa_offset 96
; CHECK-NEXT: stp d15, d14, [sp, #16] ; 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] ; 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] ; 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] ; 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_offset b8, -24
; CHECK-NEXT: .cfi_offset b9, -32
; CHECK-NEXT: .cfi_offset b10, -40
; CHECK-NEXT: .cfi_offset b11, -48
; CHECK-NEXT: .cfi_offset b12, -56
; CHECK-NEXT: .cfi_offset b13, -64
; CHECK-NEXT: .cfi_offset b14, -72
; CHECK-NEXT: .cfi_offset b15, -80
; CHECK-NEXT: str h1, [sp, #12] ; 2-byte Spill
; CHECK-NEXT: str h0, [sp, #14] ; 2-byte Spill
; CHECK-NEXT: fneg h0, h1
; CHECK-NEXT: str h0, [sp, #10] ; 2-byte Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldr h0, [sp, #14] ; 2-byte Reload
; CHECK-NEXT: ldr h1, [sp, #10] ; 2-byte Reload
; CHECK-NEXT: ldr h2, [sp, #12] ; 2-byte Reload
; CHECK-NEXT: fmov s3, s0
; CHECK-NEXT: bl _streaming_callee_h
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] ; 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: .cfi_restore b8
; CHECK-NEXT: .cfi_restore b9
; CHECK-NEXT: .cfi_restore b10
; CHECK-NEXT: .cfi_restore b11
; CHECK-NEXT: .cfi_restore b12
; CHECK-NEXT: .cfi_restore b13
; CHECK-NEXT: .cfi_restore b14
; CHECK-NEXT: .cfi_restore b15
; CHECK-NEXT: ret
entry:
; Expected to be computed (and spilled) before the switch to streaming mode.
%negs = fneg half %s
; %c is used twice so that the lowered call contains a register copy of a half.
call void @streaming_callee_h(i64 %n, i64 %m, half %c, half %negs, half %s, half %c)
ret void
}
; Test mixed calls: normal -> streaming -> normal
; The same six arguments are passed to three consecutive calls. The assertions
; expect NEON copies (`mov.16b`) for the two @normal_callee calls, which sit
; outside the `smstart sm` / `smstop sm` pair, and the SVE form
; (`mov z3.d, z0.d`) for the copy inside it. %X and %Y are kept in x20 / x19
; across the calls and the vector values are kept in stack slots.
define void @mixed_calls(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %S) "target-features"="+sme2" {
; CHECK-LABEL: mixed_calls:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: sub sp, sp, #192
; CHECK-NEXT: .cfi_def_cfa_offset 192
; CHECK-NEXT: stp d15, d14, [sp, #96] ; 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #112] ; 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #128] ; 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #144] ; 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #160] ; 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #176] ; 16-byte Folded Spill
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_offset w19, -24
; CHECK-NEXT: .cfi_offset w20, -32
; CHECK-NEXT: .cfi_offset b8, -40
; CHECK-NEXT: .cfi_offset b9, -48
; CHECK-NEXT: .cfi_offset b10, -56
; CHECK-NEXT: .cfi_offset b11, -64
; CHECK-NEXT: .cfi_offset b12, -72
; CHECK-NEXT: .cfi_offset b13, -80
; CHECK-NEXT: .cfi_offset b14, -88
; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: mov.16b v2, v1
; CHECK-NEXT: str q1, [sp, #80] ; 16-byte Spill
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: fneg.2d v1, v1
; CHECK-NEXT: stp q0, q1, [sp, #48] ; 32-byte Folded Spill
; CHECK-NEXT: mov.16b v3, v0
; CHECK-NEXT: bl _normal_callee
; CHECK-NEXT: ldr q4, [sp, #48] ; 16-byte Reload
; CHECK-NEXT: ldp q0, q5, [sp, #64] ; 32-byte Folded Reload
; CHECK-NEXT: stp q4, q5, [sp, #16] ; 32-byte Folded Spill
; CHECK-NEXT: str q0, [sp] ; 16-byte Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: mov x1, x19
; CHECK-NEXT: ldp q1, q0, [sp] ; 32-byte Folded Reload
; CHECK-NEXT: ldr q2, [sp, #32] ; 16-byte Reload
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: bl _streaming_callee
; CHECK-NEXT: smstop sm
; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: mov x1, x19
; CHECK-NEXT: ldp q0, q1, [sp, #48] ; 32-byte Folded Reload
; CHECK-NEXT: ldr q2, [sp, #80] ; 16-byte Reload
; CHECK-NEXT: mov.16b v3, v0
; CHECK-NEXT: bl _normal_callee
; CHECK-NEXT: ldp x29, x30, [sp, #176] ; 16-byte Folded Reload
; CHECK-NEXT: ldp x20, x19, [sp, #160] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #144] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #128] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #112] ; 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #96] ; 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #192
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: .cfi_restore w19
; CHECK-NEXT: .cfi_restore w20
; CHECK-NEXT: .cfi_restore b8
; CHECK-NEXT: .cfi_restore b9
; CHECK-NEXT: .cfi_restore b10
; CHECK-NEXT: .cfi_restore b11
; CHECK-NEXT: .cfi_restore b12
; CHECK-NEXT: .cfi_restore b13
; CHECK-NEXT: .cfi_restore b14
; CHECK-NEXT: .cfi_restore b15
; CHECK-NEXT: ret
entry:
; Computed once and reused by all three calls.
%negS = fneg <2 x double> %S
; First call - normal function
call void @normal_callee(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %negS, <2 x double> %S, <2 x double> %C)
; Second call - streaming function (requires smstart/smstop)
call void @streaming_callee(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %negS, <2 x double> %S, <2 x double> %C)
; Third call - normal function again
call void @normal_callee(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %negS, <2 x double> %S, <2 x double> %C)
ret void
}