; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize \
; RUN:   -scalable-vectorization=on -mattr=+sve2 \
; RUN:   -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=COMMON,SVE

; RUN: opt -passes=loop-vectorize \
; RUN:   -scalable-vectorization=off -mattr=+neon,+dotprod \
; RUN:   -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=COMMON,NEON
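
; The SVE2 run checks the costs of scalable partial reductions (VF vscale x 16),
; while the NEON+dotprod run checks the fixed-width equivalent (VF 16).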

; COMMON: LV: Checking a loop in 'sub_reduction'
; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))
; NEON: Cost of 1 for VF 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))

; COMMON: LV: Checking a loop in 'add_sub_chained_reduction'
; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))
; SVE: Cost of 9 for VF vscale x 16: EXPRESSION vp<{{.*}}> = vp<{{.*}}> + partial.reduce.add (sub (0, mul (ir<%load2> sext to i32), (ir<%load3> sext to i32)))
; NEON: Cost of 1 for VF 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))
; NEON: Cost of 9 for VF 16: EXPRESSION vp<{{.*}}> = vp<{{.*}}> + partial.reduce.add (sub (0, mul (ir<%load2> sext to i32), (ir<%load3> sext to i32)))

target triple = "aarch64"

; Test the cost of a SUB reduction, where the SUB is implemented outside the loop
; and therefore not part of the partial reduction.
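;
; A rough C equivalent of the loop (an illustrative sketch; the names and the
; do-while shape are paraphrased from the IR below, which is the authoritative
; input):
;
;   int32_t acc = init;
;   uint32_t i = 0;
;   do {
;     acc -= (int32_t)arr1[i] * (int32_t)arr2[i];
;   } while (++i < n);
;   return acc;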
define i32 @sub_reduction(ptr %arr1, ptr %arr2, i32 %init, i32 %n) #0 {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi i32 [ %init, %entry ], [ %sub, %loop ]
  %gep1 = getelementptr inbounds i8, ptr %arr1, i32 %iv
  %load1 = load i8, ptr %gep1
  %sext1 = sext i8 %load1 to i32
  %gep2 = getelementptr inbounds i8, ptr %arr2, i32 %iv
  %load2 = load i8, ptr %gep2
  %sext2 = sext i8 %load2 to i32
  %mul = mul i32 %sext1, %sext2
  %sub = sub i32 %acc, %mul
  %iv.next = add i32 %iv, 1
  %cmp = icmp ult i32 %iv.next, %n
  br i1 %cmp, label %loop, label %exit, !llvm.loop !0

exit:
  ret i32 %sub
}

; Test that the cost of a SUB that is part of an ADD-SUB reduction chain
; is high, because the negation happens inside the loop and cannot be
; folded into the SDOT instruction (because of the extend).
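;
; A rough C equivalent (again an illustrative sketch paraphrased from the IR):
;
;   int32_t acc = init;
;   uint32_t i = 0;
;   do {
;     acc += (int32_t)arr1[i] * (int32_t)arr2[i];
;     acc -= (int32_t)arr2[i] * (int32_t)arr3[i];
;   } while (++i < n);
;   return acc;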
define i32 @add_sub_chained_reduction(ptr %arr1, ptr %arr2, ptr %arr3, i32 %init, i32 %n) #0 {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi i32 [ %init, %entry ], [ %sub, %loop ]
  %gep1 = getelementptr inbounds i8, ptr %arr1, i32 %iv
  %load1 = load i8, ptr %gep1
  %sext1 = sext i8 %load1 to i32
  %gep2 = getelementptr inbounds i8, ptr %arr2, i32 %iv
  %load2 = load i8, ptr %gep2
  %sext2 = sext i8 %load2 to i32
  %mul1 = mul i32 %sext1, %sext2
  %add = add i32 %acc, %mul1
  %gep3 = getelementptr inbounds i8, ptr %arr3, i32 %iv
  %load3 = load i8, ptr %gep3
  %sext3 = sext i8 %load3 to i32
  %mul2 = mul i32 %sext2, %sext3
  %sub = sub i32 %add, %mul2
  %iv.next = add i32 %iv, 1
  %cmp = icmp ult i32 %iv.next, %n
  br i1 %cmp, label %loop, label %exit, !llvm.loop !0

exit:
  ret i32 %sub
}

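; vscale_range(1,16) tells the cost model that vscale lies between 1 and 16,
; which the SVE run relies on when costing the scalable VF.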
attributes #0 = { vscale_range(1,16) }

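; Pin the plan to a single configuration: no interleaving and a width hint of
; 16, which the two RUN lines cost as VF vscale x 16 (SVE) and VF 16 (NEON).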
!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.interleave.count", i32 1}
!2 = !{!"llvm.loop.vectorize.width", i32 16}