| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s |
| |
| declare void @normal_callee(); |
| declare void @streaming_callee() "aarch64_pstate_sm_enabled"; |
| declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; |
| |
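; Test that a function with a locally streaming body (but a normal interface)
; enters streaming mode with smstart after the callee-save spills and leaves it
; with smstop before the epilogue, and that no extra mode changes are emitted
; around the calls, since the callee is streaming-compatible.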
| define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind { |
| ; CHECK-LABEL: locally_streaming_caller_streaming_callee: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill |
| ; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: bl streaming_compatible_callee |
| ; CHECK-NEXT: bl streaming_compatible_callee |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload |
| ; CHECK-NEXT: ret |
| |
| call void @streaming_compatible_callee(); |
| call void @streaming_compatible_callee(); |
| ret void; |
| } |
| |
; Test that for a function with both a streaming body and a streaming interface,
; no smstart/smstop pair is emitted, because the function is already in
; streaming mode on entry.
| define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind { |
| ; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill |
| ; CHECK-NEXT: bl streaming_callee |
| ; CHECK-NEXT: bl streaming_callee |
| ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload |
| ; CHECK-NEXT: ret |
| call void @streaming_callee(); |
| call void @streaming_callee(); |
| ret void; |
| } |
| |
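; Test that with multiple exit blocks (and -enable-tail-merge=false on the RUN
; line), each return path gets its own smstop and callee-save restore sequence.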
| define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind { |
| ; CHECK-LABEL: locally_streaming_multiple_exit: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: cmp x0, #1 |
| ; CHECK-NEXT: b.ne .LBB2_2 |
| ; CHECK-NEXT: // %bb.1: // %if.else |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload |
| ; CHECK-NEXT: ret |
| ; CHECK-NEXT: .LBB2_2: // %if.end |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload |
| ; CHECK-NEXT: ret |
| |
| entry: |
| %tobool = icmp eq i64 %cond, 1 |
| br i1 %tobool, label %if.else, label %if.end |
| |
| if.else: |
| ret void; |
| |
| if.end: |
| ret void; |
| } |
| |
; Do a fixed-width vector add on a NEON vector.
; This tests that:
; * the incoming vector argument in v0 isn't clobbered by the changes in streaming mode.
; * the result vector is correctly preserved across the final smstop.
| define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind { |
| ; CHECK-LABEL: locally_streaming_caller_no_callee: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill |
| ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill |
| ; CHECK-NEXT: addsvl sp, sp, #-1 |
| ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 |
| ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: index z0.d, #0, #1 |
| ; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload |
| ; CHECK-NEXT: add z0.d, z0.d, z1.d |
| ; CHECK-NEXT: add z0.d, z0.d, #41 // =0x29 |
| ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload |
| ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 |
| ; CHECK-NEXT: addsvl sp, sp, #1 |
| ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload |
| ; CHECK-NEXT: ret |
| |
| %add = add <2 x i64> %a, <i64 41, i64 42>; |
| ret <2 x i64> %add; |
| } |
| |
; Test that we use the callee's interface (not its body) to determine which
; streaming mode to call it in. In this case the interface is non-streaming,
; so PSTATE.SM must be 0 when entering the callee and is 0 again on return.
| define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind { |
| ; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill |
| ; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: bl locally_streaming_caller_streaming_callee |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload |
| ; CHECK-NEXT: ret |
| |
| call void @locally_streaming_caller_streaming_callee(); |
| ret void; |
| } |
| |
| ; |
; Test that a locally streaming function correctly preserves the
; argument/result registers, because the smstart/smstop instructions that are
; inserted to implement the arm_locally_streaming attribute clobber the
; vector register contents.
| ; |
| |
| define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind { |
| ; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: sub sp, sp, #96 |
| ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill |
| ; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill |
| ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload |
| ; CHECK-NEXT: bl streaming_compatible_callee_vec_args_ret |
| ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: add sp, sp, #96 |
| ; CHECK-NEXT: ret |
| %res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible" |
| ret <2 x i64> %res; |
| } |
| |
| declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_pstate_sm_compatible" |
| |
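; Same as above, but with a struct argument/result that is passed and returned
; in multiple vector registers (q0/q1), which must likewise be preserved across
; the smstart/smstop pairs.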
| define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind { |
| ; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: sub sp, sp, #112 |
| ; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill |
| ; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill |
| ; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: bl streaming_compatible_callee_vec_arg_struct_ret |
| ; CHECK-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload |
| ; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: add sp, sp, #112 |
| ; CHECK-NEXT: ret |
| %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1 |
| %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible" |
| ret {<2 x i64>, <2 x i64>} %res; |
| } |
| |
| declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64>) "aarch64_pstate_sm_compatible" |
| |
; Test that `addsvl` is used to allocate the stack space for scalable locals
; before `smstart`, so that the correct amount of stack space is reserved
; (`addsvl` adds a multiple of the streaming vector length, independent of the
; current value of PSTATE.SM).
| define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" { |
| ; CHECK-LABEL: locally_streaming_caller_alloca: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill |
| ; CHECK-NEXT: addsvl sp, sp, #-1 |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: mov x0, sp |
| ; CHECK-NEXT: bl use_ptr |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: addsvl sp, sp, #1 |
| ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload |
| ; CHECK-NEXT: ret |
| %alloca = alloca <vscale x 4 x i32> |
| call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible" |
| ret void |
| } |
| |
| declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible" |
| |
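; Test a call to an intrinsic without a chain operand: the llvm.cos.f64 call is
; lowered to a library call to `cos`, which has a normal (non-streaming)
; interface, so streaming mode is disabled around the call and the d0
; argument/result is spilled and reloaded across each mode change.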
| define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" { |
| ; CHECK-LABEL: call_to_intrinsic_without_chain: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: sub sp, sp, #96 |
| ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill |
| ; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill |
| ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload |
| ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload |
| ; CHECK-NEXT: bl cos |
| ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload |
| ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: add sp, sp, #96 |
| ; CHECK-NEXT: ret |
| entry: |
| %0 = call fast double @llvm.cos.f64(double %x) |
| ret double %0 |
| } |
| |
| declare double @llvm.cos.f64(double) |
| |
| |
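; Test that the incoming float argument in s0 survives the switch into
; streaming mode and the loop: it is spilled to the stack before smstart and
; only reloaded in the loop-exit block, where the fadd uses it.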
| define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstate_sm_body" { |
| ; CHECK-LABEL: test_arg_survives_loop: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: sub sp, sp, #80 |
| ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill |
| ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: .LBB9_1: // %for.body |
| ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: subs w0, w0, #1 |
| ; CHECK-NEXT: b.ne .LBB9_1 |
| ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup |
| ; CHECK-NEXT: fmov s0, #1.00000000 |
| ; CHECK-NEXT: ldr s1, [sp, #12] // 4-byte Folded Reload |
| ; CHECK-NEXT: fadd s0, s1, s0 |
| ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: add sp, sp, #80 |
| ; CHECK-NEXT: ret |
| entry: |
| br label %for.body |
| |
| for.body: |
| %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ] |
| %inc = add nuw nsw i32 %i.02, 1 |
| %exitcond.not = icmp eq i32 %inc, %N |
| br i1 %exitcond.not, label %for.cond.cleanup, label %for.body |
| |
| for.cond.cleanup: |
| %add = fadd float %arg, 1.000000e+00 |
| ret float %add |
| |
| } |
| |
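; Test that tail-call optimization is disabled for a locally streaming
; function: the `tail call` in the IR is lowered to a regular bl, because
; smstop must be emitted after the call returns and before this function's
; own return.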
| define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind { |
| ; CHECK-LABEL: disable_tailcallopt: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill |
| ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill |
| ; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill |
| ; CHECK-NEXT: smstart sm |
| ; CHECK-NEXT: bl streaming_compatible_callee |
| ; CHECK-NEXT: smstop sm |
| ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload |
| ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload |
| ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload |
| ; CHECK-NEXT: ret |
| tail call void @streaming_compatible_callee(); |
| ret void; |
| } |