; test/CodeGen/AArch64/sched-past-vector-ldst.ll - llvm-project/llvm - Git at Google

 ; RUN: llc < %s -mcpu=cortex-a53 -enable-post-misched=false -enable-aa-sched-mi | FileCheck %s

 ; Check that the vector store intrinsic does not prevent fmla instructions from
 ; being scheduled together.  Since the vector loads and stores generated from
 ; the intrinsics do not alias each other, the store can be pushed past the load.
 ; This allows fmla instructions to be scheduled together.


 ; CHECK: fmla
 ; CHECK-NEXT: mov
 ; CHECK-NEXT: mov
 ; CHECK-NEXT: fmla
 ; CHECK-NEXT: fmla
 ; CHECK-NEXT: fmla
 ; Data layout: little-endian ("e"), ELF name mangling ("m:e"), i64 aligned to
 ; 64 bits and i128 to 128 bits, native integer widths of 8/16/32/64 bits, and
 ; a 128-bit natural stack alignment ("S128").
 target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
 ; AArch64 architecture, empty vendor field, Linux OS, GNU environment.
 target triple = "aarch64--linux-gnu"

 ; Aggregate indexed by @func.  Fields 2 and 3 are two separate arrays of 16
 ; {float, float} pairs; @func only loads from field 2 and only stores to
 ; field 3.  The remaining fields are never accessed in this file.
 %Struct = type { ptr, [9 x double], [16 x {float, float}], [16 x {float, float}], i32, i32 }

 ; Function Attrs: nounwind
 ; @func(%this, %f): processes two windows of %Struct, one starting at array
 ; element 8 and one at element 12.  For each window it loads two <4 x float>
 ; vectors from field 2 with ld2, computes %f + %f * v for each of them, and
 ; writes both results to the same element index of field 3 with st2.
 ;
 ; In program order the first st2 sits between the two ld2 calls.  The loads
 ; only touch field 2 and the stores only touch field 3, which is the
 ; non-aliasing property the test relies on so that the store can be scheduled
 ; past the second load.  The four fmul/fadd pairs below correspond to the four
 ; fmla instructions the CHECK lines expect.
 define linkonce_odr void @func(ptr nocapture %this, <4 x float> %f) unnamed_addr #0 align 2 {
 entry:
   ; ---- Window 1: element 8 ----
   ; Address of the first float of element 8 in field 2 (the load source).
   %scevgep = getelementptr %Struct, ptr %this, i64 0, i32 2, i64 8, i32 0
   %vec1 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr %scevgep)
   ; Second ld2 result: %av1 = %f + %f * %ev1.
   %ev1 = extractvalue { <4 x float>, <4 x float> } %vec1, 1
   %fm1 = fmul <4 x float> %f, %ev1
   %av1 = fadd <4 x float> %f, %fm1
   ; First ld2 result: %av2 = %f + %f * %ev2.
   %ev2 = extractvalue { <4 x float>, <4 x float> } %vec1, 0
   %fm2 = fmul <4 x float> %f, %ev2
   %av2 = fadd <4 x float> %f, %fm2
   ; Address of the first float of element 8 in field 3 (the store target).
   %scevgep2 = getelementptr %Struct, ptr %this, i64 0, i32 3, i64 8, i32 0
   ; Operands are passed as (%av2, %av1) so the value derived from ld2 result 0
   ; is stored first, matching the order in which the vectors were loaded.
   tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %av2, <4 x float> %av1, ptr %scevgep2)
   ; ---- Window 2: element 12 (same computation as window 1) ----
   ; This load follows the store above in program order but reads field 2,
   ; which that store does not write.
   %scevgep3 = getelementptr %Struct, ptr %this, i64 0, i32 2, i64 12, i32 0
   %vec2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr %scevgep3)
   ; Second ld2 result: %av3 = %f + %f * %ev3.
   %ev3 = extractvalue { <4 x float>, <4 x float> } %vec2, 1
   %fm3 = fmul <4 x float> %f, %ev3
   %av3 = fadd <4 x float> %f, %fm3
   ; First ld2 result: %av4 = %f + %f * %ev4.
   %ev4 = extractvalue { <4 x float>, <4 x float> } %vec2, 0
   %fm4 = fmul <4 x float> %f, %ev4
   %av4 = fadd <4 x float> %f, %fm4
   ; Address of the first float of element 12 in field 3 (the store target).
   %scevgep4 = getelementptr %Struct, ptr %this, i64 0, i32 3, i64 12, i32 0
   tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %av4, <4 x float> %av3, ptr %scevgep4)
   ret void
 }

 ; Function Attrs: nounwind readonly
 ; NEON ld2 intrinsic: takes one pointer and returns a pair of <4 x float>
 ; values.  Declared with attribute group #2 (nounwind readonly).
 declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr) #2

 ; Function Attrs: nounwind
 ; NEON st2 intrinsic: takes two <4 x float> values and a nocapture pointer
 ; and returns nothing.  Declared with attribute group #1 (nounwind).
 declare void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float>, <4 x float>, ptr nocapture) #1

 ; #0 is the group attached to @func.  Besides nounwind it sets string
 ; attributes that relax floating-point semantics (no-infs, no-nans,
 ; unsafe-fp-math) and requests a frame pointer in all functions.
 ; NOTE(review): the relaxed FP attributes are presumably what allow each
 ; fmul/fadd pair to be fused into fmla -- confirm against the backend.
 attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
 ; #1 is used by the st2 declaration, #2 by the ld2 declaration.
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
	; RUN: llc < %s -mcpu=cortex-a53 -enable-post-misched=false -enable-aa-sched-mi | FileCheck %s

	; Check that the vector store intrinsic does not prevent fmla instructions from
	; being scheduled together. Since the vector loads and stores generated from
	; the intrinsics do not alias each other, the store can be pushed past the load.
	; This allows fmla instructions to be scheduled together.


	; CHECK: fmla
	; CHECK-NEXT: mov
	; CHECK-NEXT: mov
	; CHECK-NEXT: fmla
	; CHECK-NEXT: fmla
	; CHECK-NEXT: fmla
	; Data layout: little-endian ("e"), ELF name mangling ("m:e"), i64 aligned to
	; 64 bits and i128 to 128 bits, native integer widths of 8/16/32/64 bits, and
	; a 128-bit natural stack alignment ("S128").
	target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
	; AArch64 architecture, empty vendor field, Linux OS, GNU environment.
	target triple = "aarch64--linux-gnu"

	; Aggregate indexed by @func.  Fields 2 and 3 are two separate arrays of 16
	; {float, float} pairs; @func only loads from field 2 and only stores to
	; field 3.  The remaining fields are never accessed in this file.
	%Struct = type { ptr, [9 x double], [16 x {float, float}], [16 x {float, float}], i32, i32 }

	; Function Attrs: nounwind
	; @func(%this, %f): processes two windows of %Struct, one starting at array
	; element 8 and one at element 12.  For each window it loads two <4 x float>
	; vectors from field 2 with ld2, computes %f + %f * v for each of them, and
	; writes both results to the same element index of field 3 with st2.
	;
	; In program order the first st2 sits between the two ld2 calls.  The loads
	; only touch field 2 and the stores only touch field 3, which is the
	; non-aliasing property the test relies on so that the store can be scheduled
	; past the second load.  The four fmul/fadd pairs below correspond to the four
	; fmla instructions the CHECK lines expect.
	define linkonce_odr void @func(ptr nocapture %this, <4 x float> %f) unnamed_addr #0 align 2 {
	entry:
	; ---- Window 1: element 8 ----
	; Address of the first float of element 8 in field 2 (the load source).
	%scevgep = getelementptr %Struct, ptr %this, i64 0, i32 2, i64 8, i32 0
	%vec1 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr %scevgep)
	; Second ld2 result: %av1 = %f + %f * %ev1.
	%ev1 = extractvalue { <4 x float>, <4 x float> } %vec1, 1
	%fm1 = fmul <4 x float> %f, %ev1
	%av1 = fadd <4 x float> %f, %fm1
	; First ld2 result: %av2 = %f + %f * %ev2.
	%ev2 = extractvalue { <4 x float>, <4 x float> } %vec1, 0
	%fm2 = fmul <4 x float> %f, %ev2
	%av2 = fadd <4 x float> %f, %fm2
	; Address of the first float of element 8 in field 3 (the store target).
	%scevgep2 = getelementptr %Struct, ptr %this, i64 0, i32 3, i64 8, i32 0
	; Operands are passed as (%av2, %av1) so the value derived from ld2 result 0
	; is stored first, matching the order in which the vectors were loaded.
	tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %av2, <4 x float> %av1, ptr %scevgep2)
	; ---- Window 2: element 12 (same computation as window 1) ----
	; This load follows the store above in program order but reads field 2,
	; which that store does not write.
	%scevgep3 = getelementptr %Struct, ptr %this, i64 0, i32 2, i64 12, i32 0
	%vec2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr %scevgep3)
	; Second ld2 result: %av3 = %f + %f * %ev3.
	%ev3 = extractvalue { <4 x float>, <4 x float> } %vec2, 1
	%fm3 = fmul <4 x float> %f, %ev3
	%av3 = fadd <4 x float> %f, %fm3
	; First ld2 result: %av4 = %f + %f * %ev4.
	%ev4 = extractvalue { <4 x float>, <4 x float> } %vec2, 0
	%fm4 = fmul <4 x float> %f, %ev4
	%av4 = fadd <4 x float> %f, %fm4
	; Address of the first float of element 12 in field 3 (the store target).
	%scevgep4 = getelementptr %Struct, ptr %this, i64 0, i32 3, i64 12, i32 0
	tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %av4, <4 x float> %av3, ptr %scevgep4)
	ret void
	}

	; Function Attrs: nounwind readonly
	; NEON ld2 intrinsic: takes one pointer and returns a pair of <4 x float>
	; values.  Declared with attribute group #2 (nounwind readonly).
	declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr) #2

	; Function Attrs: nounwind
	; NEON st2 intrinsic: takes two <4 x float> values and a nocapture pointer
	; and returns nothing.  Declared with attribute group #1 (nounwind).
	declare void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float>, <4 x float>, ptr nocapture) #1

	; #0 is the group attached to @func.  Besides nounwind it sets string
	; attributes that relax floating-point semantics (no-infs, no-nans,
	; unsafe-fp-math) and requests a frame pointer in all functions.
	; NOTE(review): the relaxed FP attributes are presumably what allow each
	; fmul/fadd pair to be fused into fmla -- confirm against the backend.
	attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
	; #1 is used by the st2 declaration, #2 by the ld2 declaration.
	attributes #1 = { nounwind }
	attributes #2 = { nounwind readonly }