| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple=arm64-apple-macosx -mcpu=apple-m4 < %s | FileCheck %s |
| |
| ; Test that FPR copies in functions with streaming mode changes |
| ; use SVE/scalar instructions instead of NEON to avoid illegal instructions |
| ; in streaming regions. For the 32- and 64-bit FPR cases, Apple zero-cycle |
| ; moves can also trigger this issue. |
| |
| ; External callees sharing one signature: two pointers followed by four |
| ; <2 x double> vectors. @streaming_callee carries "aarch64_pstate_sm_enabled"; |
| ; @normal_callee has no streaming attribute. |
| declare void @streaming_callee(ptr, ptr, <2 x double>, <2 x double>, <2 x double>, <2 x double>) "aarch64_pstate_sm_enabled" |
| declare void @normal_callee(ptr, ptr, <2 x double>, <2 x double>, <2 x double>, <2 x double>) |
| |
| ; 128-bit vector case: a function without a streaming attribute calls |
| ; @streaming_callee. The expected output brackets the call with `smstart sm` |
| ; and `smstop sm`, and the copy of %C into the fourth vector argument, which |
| ; sits inside that bracket, is expected in the SVE form `mov z3.d, z0.d`. |
| define void @caller(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %S) "target-features"="+sme2" { |
| ; CHECK-LABEL: caller: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: sub sp, sp, #128 |
| ; CHECK-NEXT: .cfi_def_cfa_offset 128 |
| ; CHECK-NEXT: stp d15, d14, [sp, #48] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #64] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #80] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #96] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp x29, x30, [sp, #112] ; 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_offset w30, -8 |
| ; CHECK-NEXT: .cfi_offset w29, -16 |
| ; CHECK-NEXT: .cfi_offset b8, -24 |
| ; CHECK-NEXT: .cfi_offset b9, -32 |
| ; CHECK-NEXT: .cfi_offset b10, -40 |
| ; CHECK-NEXT: .cfi_offset b11, -48 |
| ; CHECK-NEXT: .cfi_offset b12, -56 |
| ; CHECK-NEXT: .cfi_offset b13, -64 |
| ; CHECK-NEXT: .cfi_offset b14, -72 |
| ; CHECK-NEXT: .cfi_offset b15, -80 |
| ; CHECK-NEXT: stp q1, q0, [sp, #16] ; 32-byte Folded Spill |
| ; CHECK-NEXT: fneg.2d v0, v1 |
| ; CHECK-NEXT: str q0, [sp] ; 16-byte Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: ldp q2, q0, [sp, #16] ; 32-byte Folded Reload |
| ; CHECK-NEXT: ldr q1, [sp] ; 16-byte Reload |
| ; CHECK-NEXT: mov z3.d, z0.d |
| ; CHECK-NEXT: bl _streaming_callee |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp x29, x30, [sp, #112] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d9, d8, [sp, #96] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #80] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #64] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp, #48] ; 16-byte Folded Reload |
| ; CHECK-NEXT: add sp, sp, #128 |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: .cfi_restore b8 |
| ; CHECK-NEXT: .cfi_restore b9 |
| ; CHECK-NEXT: .cfi_restore b10 |
| ; CHECK-NEXT: .cfi_restore b11 |
| ; CHECK-NEXT: .cfi_restore b12 |
| ; CHECK-NEXT: .cfi_restore b13 |
| ; CHECK-NEXT: .cfi_restore b14 |
| ; CHECK-NEXT: .cfi_restore b15 |
| ; CHECK-NEXT: ret |
| entry: |
| %negS = fneg <2 x double> %S |
| ; %C is passed as both the first and the fourth vector argument, so the same |
| ; value has to be present in two argument registers at the call. |
| call void @streaming_callee(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %negS, <2 x double> %S, <2 x double> %C) |
| ret void |
| } |
| |
| ; Streaming callee ("aarch64_pstate_sm_enabled") called by @fpr64_copy: two i64 |
| ; arguments followed by four double arguments. |
| declare void @streaming_callee_d(i64, i64, double, double, double, double) "aarch64_pstate_sm_enabled" |
| |
| ; 64-bit scalar case: same shape as the vector test but with double |
| ; arguments. The copy of %c into the fourth floating-point argument is |
| ; expected as the scalar move `fmov d3, d0`, emitted between `smstart sm` and |
| ; the call. |
| define void @fpr64_copy(i64 %n, i64 %m, double %c, double %s) "target-features"="+sme2" { |
| ; CHECK-LABEL: fpr64_copy: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: sub sp, sp, #112 |
| ; CHECK-NEXT: .cfi_def_cfa_offset 112 |
| ; CHECK-NEXT: stp d15, d14, [sp, #32] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #48] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #64] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #80] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp x29, x30, [sp, #96] ; 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_offset w30, -8 |
| ; CHECK-NEXT: .cfi_offset w29, -16 |
| ; CHECK-NEXT: .cfi_offset b8, -24 |
| ; CHECK-NEXT: .cfi_offset b9, -32 |
| ; CHECK-NEXT: .cfi_offset b10, -40 |
| ; CHECK-NEXT: .cfi_offset b11, -48 |
| ; CHECK-NEXT: .cfi_offset b12, -56 |
| ; CHECK-NEXT: .cfi_offset b13, -64 |
| ; CHECK-NEXT: .cfi_offset b14, -72 |
| ; CHECK-NEXT: .cfi_offset b15, -80 |
| ; CHECK-NEXT: stp d1, d0, [sp, #16] ; 16-byte Folded Spill |
| ; CHECK-NEXT: fneg d0, d1 |
| ; CHECK-NEXT: str d0, [sp, #8] ; 8-byte Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: ldp d2, d0, [sp, #16] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldr d1, [sp, #8] ; 8-byte Reload |
| ; CHECK-NEXT: fmov d3, d0 |
| ; CHECK-NEXT: bl _streaming_callee_d |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp x29, x30, [sp, #96] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d9, d8, [sp, #80] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #64] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #48] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp, #32] ; 16-byte Folded Reload |
| ; CHECK-NEXT: add sp, sp, #112 |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: .cfi_restore b8 |
| ; CHECK-NEXT: .cfi_restore b9 |
| ; CHECK-NEXT: .cfi_restore b10 |
| ; CHECK-NEXT: .cfi_restore b11 |
| ; CHECK-NEXT: .cfi_restore b12 |
| ; CHECK-NEXT: .cfi_restore b13 |
| ; CHECK-NEXT: .cfi_restore b14 |
| ; CHECK-NEXT: .cfi_restore b15 |
| ; CHECK-NEXT: ret |
| entry: |
| %negs = fneg double %s |
| ; %c is passed as both the first and the fourth double argument, so the same |
| ; value has to be present in two argument registers at the call. |
| call void @streaming_callee_d(i64 %n, i64 %m, double %c, double %negs, double %s, double %c) |
| ret void |
| } |
| |
| ; Streaming callee ("aarch64_pstate_sm_enabled") called by @fpr32_copy: two i64 |
| ; arguments followed by four float arguments. |
| declare void @streaming_callee_f(i64, i64, float, float, float, float) "aarch64_pstate_sm_enabled" |
| |
| ; 32-bit scalar case: same shape as the 64-bit test but with float |
| ; arguments. The copy of %c into the fourth floating-point argument is |
| ; expected as the scalar move `fmov s3, s0`, emitted between `smstart sm` and |
| ; the call. |
| define void @fpr32_copy(i64 %n, i64 %m, float %c, float %s) "target-features"="+sme2" { |
| ; CHECK-LABEL: fpr32_copy: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: sub sp, sp, #96 |
| ; CHECK-NEXT: .cfi_def_cfa_offset 96 |
| ; CHECK-NEXT: stp d15, d14, [sp, #16] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #32] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #48] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #64] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_offset w30, -8 |
| ; CHECK-NEXT: .cfi_offset w29, -16 |
| ; CHECK-NEXT: .cfi_offset b8, -24 |
| ; CHECK-NEXT: .cfi_offset b9, -32 |
| ; CHECK-NEXT: .cfi_offset b10, -40 |
| ; CHECK-NEXT: .cfi_offset b11, -48 |
| ; CHECK-NEXT: .cfi_offset b12, -56 |
| ; CHECK-NEXT: .cfi_offset b13, -64 |
| ; CHECK-NEXT: .cfi_offset b14, -72 |
| ; CHECK-NEXT: .cfi_offset b15, -80 |
| ; CHECK-NEXT: stp s1, s0, [sp, #8] ; 8-byte Folded Spill |
| ; CHECK-NEXT: fneg s0, s1 |
| ; CHECK-NEXT: str s0, [sp, #4] ; 4-byte Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: ldp s2, s0, [sp, #8] ; 8-byte Folded Reload |
| ; CHECK-NEXT: ldr s1, [sp, #4] ; 4-byte Reload |
| ; CHECK-NEXT: fmov s3, s0 |
| ; CHECK-NEXT: bl _streaming_callee_f |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d9, d8, [sp, #64] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #48] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #32] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp, #16] ; 16-byte Folded Reload |
| ; CHECK-NEXT: add sp, sp, #96 |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: .cfi_restore b8 |
| ; CHECK-NEXT: .cfi_restore b9 |
| ; CHECK-NEXT: .cfi_restore b10 |
| ; CHECK-NEXT: .cfi_restore b11 |
| ; CHECK-NEXT: .cfi_restore b12 |
| ; CHECK-NEXT: .cfi_restore b13 |
| ; CHECK-NEXT: .cfi_restore b14 |
| ; CHECK-NEXT: .cfi_restore b15 |
| ; CHECK-NEXT: ret |
| entry: |
| %negs = fneg float %s |
| ; %c is passed as both the first and the fourth float argument, so the same |
| ; value has to be present in two argument registers at the call. |
| call void @streaming_callee_f(i64 %n, i64 %m, float %c, float %negs, float %s, float %c) |
| ret void |
| } |
| |
| ; Streaming callee ("aarch64_pstate_sm_enabled") called by @fpr16_copy: two i64 |
| ; arguments followed by four half arguments. |
| declare void @streaming_callee_h(i64, i64, half, half, half, half) "aarch64_pstate_sm_enabled" |
| |
| ; 16-bit scalar case: same shape as the 32-bit test but with half arguments. |
| ; The values are spilled and reloaded one at a time with `str h` / `ldr h`, |
| ; and the copy of %c into the fourth floating-point argument is expected as |
| ; an S-register move, `fmov s3, s0`, emitted between `smstart sm` and the call. |
| define void @fpr16_copy(i64 %n, i64 %m, half %c, half %s) "target-features"="+sme2" { |
| ; CHECK-LABEL: fpr16_copy: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: sub sp, sp, #96 |
| ; CHECK-NEXT: .cfi_def_cfa_offset 96 |
| ; CHECK-NEXT: stp d15, d14, [sp, #16] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #32] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #48] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #64] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_offset w30, -8 |
| ; CHECK-NEXT: .cfi_offset w29, -16 |
| ; CHECK-NEXT: .cfi_offset b8, -24 |
| ; CHECK-NEXT: .cfi_offset b9, -32 |
| ; CHECK-NEXT: .cfi_offset b10, -40 |
| ; CHECK-NEXT: .cfi_offset b11, -48 |
| ; CHECK-NEXT: .cfi_offset b12, -56 |
| ; CHECK-NEXT: .cfi_offset b13, -64 |
| ; CHECK-NEXT: .cfi_offset b14, -72 |
| ; CHECK-NEXT: .cfi_offset b15, -80 |
| ; CHECK-NEXT: str h1, [sp, #12] ; 2-byte Spill |
| ; CHECK-NEXT: str h0, [sp, #14] ; 2-byte Spill |
| ; CHECK-NEXT: fneg h0, h1 |
| ; CHECK-NEXT: str h0, [sp, #10] ; 2-byte Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: ldr h0, [sp, #14] ; 2-byte Reload |
| ; CHECK-NEXT: ldr h1, [sp, #10] ; 2-byte Reload |
| ; CHECK-NEXT: ldr h2, [sp, #12] ; 2-byte Reload |
| ; CHECK-NEXT: fmov s3, s0 |
| ; CHECK-NEXT: bl _streaming_callee_h |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d9, d8, [sp, #64] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #48] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #32] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp, #16] ; 16-byte Folded Reload |
| ; CHECK-NEXT: add sp, sp, #96 |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: .cfi_restore b8 |
| ; CHECK-NEXT: .cfi_restore b9 |
| ; CHECK-NEXT: .cfi_restore b10 |
| ; CHECK-NEXT: .cfi_restore b11 |
| ; CHECK-NEXT: .cfi_restore b12 |
| ; CHECK-NEXT: .cfi_restore b13 |
| ; CHECK-NEXT: .cfi_restore b14 |
| ; CHECK-NEXT: .cfi_restore b15 |
| ; CHECK-NEXT: ret |
| entry: |
| %negs = fneg half %s |
| ; %c is passed as both the first and the fourth half argument, so the same |
| ; value has to be present in two argument registers at the call. |
| call void @streaming_callee_h(i64 %n, i64 %m, half %c, half %negs, half %s, half %c) |
| ret void |
| } |
| |
| ; Test mixed calls: normal -> streaming -> normal. |
| ; The same six arguments are passed to all three calls. The two calls to |
| ; @normal_callee are expected to copy %C with the NEON form `mov.16b v3, v0`, |
| ; while the copy emitted between `smstart sm` and the call to |
| ; @streaming_callee is expected in the SVE form `mov z3.d, z0.d`. |
| define void @mixed_calls(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %S) "target-features"="+sme2" { |
| ; CHECK-LABEL: mixed_calls: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: sub sp, sp, #192 |
| ; CHECK-NEXT: .cfi_def_cfa_offset 192 |
| ; CHECK-NEXT: stp d15, d14, [sp, #96] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #112] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #128] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #144] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp x20, x19, [sp, #160] ; 16-byte Folded Spill |
| ; CHECK-NEXT: stp x29, x30, [sp, #176] ; 16-byte Folded Spill |
| ; CHECK-NEXT: .cfi_offset w30, -8 |
| ; CHECK-NEXT: .cfi_offset w29, -16 |
| ; CHECK-NEXT: .cfi_offset w19, -24 |
| ; CHECK-NEXT: .cfi_offset w20, -32 |
| ; CHECK-NEXT: .cfi_offset b8, -40 |
| ; CHECK-NEXT: .cfi_offset b9, -48 |
| ; CHECK-NEXT: .cfi_offset b10, -56 |
| ; CHECK-NEXT: .cfi_offset b11, -64 |
| ; CHECK-NEXT: .cfi_offset b12, -72 |
| ; CHECK-NEXT: .cfi_offset b13, -80 |
| ; CHECK-NEXT: .cfi_offset b14, -88 |
| ; CHECK-NEXT: .cfi_offset b15, -96 |
| ; CHECK-NEXT: mov.16b v2, v1 |
| ; CHECK-NEXT: str q1, [sp, #80] ; 16-byte Spill |
| ; CHECK-NEXT: mov x19, x1 |
| ; CHECK-NEXT: mov x20, x0 |
| ; CHECK-NEXT: fneg.2d v1, v1 |
| ; CHECK-NEXT: stp q0, q1, [sp, #48] ; 32-byte Folded Spill |
| ; CHECK-NEXT: mov.16b v3, v0 |
| ; CHECK-NEXT: bl _normal_callee |
| ; CHECK-NEXT: ldr q4, [sp, #48] ; 16-byte Reload |
| ; CHECK-NEXT: ldp q0, q5, [sp, #64] ; 32-byte Folded Reload |
| ; CHECK-NEXT: stp q4, q5, [sp, #16] ; 32-byte Folded Spill |
| ; CHECK-NEXT: str q0, [sp] ; 16-byte Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: mov x0, x20 |
| ; CHECK-NEXT: mov x1, x19 |
| ; CHECK-NEXT: ldp q1, q0, [sp] ; 32-byte Folded Reload |
| ; CHECK-NEXT: ldr q2, [sp, #32] ; 16-byte Reload |
| ; CHECK-NEXT: mov z3.d, z0.d |
| ; CHECK-NEXT: bl _streaming_callee |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: mov x0, x20 |
| ; CHECK-NEXT: mov x1, x19 |
| ; CHECK-NEXT: ldp q0, q1, [sp, #48] ; 32-byte Folded Reload |
| ; CHECK-NEXT: ldr q2, [sp, #80] ; 16-byte Reload |
| ; CHECK-NEXT: mov.16b v3, v0 |
| ; CHECK-NEXT: bl _normal_callee |
| ; CHECK-NEXT: ldp x29, x30, [sp, #176] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp x20, x19, [sp, #160] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d9, d8, [sp, #144] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #128] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #112] ; 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp, #96] ; 16-byte Folded Reload |
| ; CHECK-NEXT: add sp, sp, #192 |
| ; CHECK-NEXT: .cfi_def_cfa_offset 0 |
| ; CHECK-NEXT: .cfi_restore w30 |
| ; CHECK-NEXT: .cfi_restore w29 |
| ; CHECK-NEXT: .cfi_restore w19 |
| ; CHECK-NEXT: .cfi_restore w20 |
| ; CHECK-NEXT: .cfi_restore b8 |
| ; CHECK-NEXT: .cfi_restore b9 |
| ; CHECK-NEXT: .cfi_restore b10 |
| ; CHECK-NEXT: .cfi_restore b11 |
| ; CHECK-NEXT: .cfi_restore b12 |
| ; CHECK-NEXT: .cfi_restore b13 |
| ; CHECK-NEXT: .cfi_restore b14 |
| ; CHECK-NEXT: .cfi_restore b15 |
| ; CHECK-NEXT: ret |
| entry: |
| %negS = fneg <2 x double> %S |
| ; First call - normal function |
| call void @normal_callee(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %negS, <2 x double> %S, <2 x double> %C) |
| ; Second call - streaming function (requires smstart/smstop) |
| call void @streaming_callee(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %negS, <2 x double> %S, <2 x double> %C) |
| ; Third call - normal function again |
| call void @normal_callee(ptr %X, ptr %Y, <2 x double> %C, <2 x double> %negS, <2 x double> %S, <2 x double> %C) |
| ret void |
| } |