; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize \
; RUN:   -scalable-vectorization=on -mattr=+sve2 \
; RUN:   -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=COMMON,SVE

; RUN: opt -passes=loop-vectorize \
; RUN:   -scalable-vectorization=off -mattr=+neon,+dotprod \
; RUN:   -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN:   -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=COMMON,NEON
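
; The SVE2 run checks the costs of scalable partial reductions (VF vscale x 16),
; while the NEON+dotprod run checks the fixed-width equivalent (VF 16).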

; COMMON: LV: Checking a loop in 'sub_reduction'
; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))
; NEON: Cost of 1 for VF 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))

; COMMON: LV: Checking a loop in 'add_sub_chained_reduction'
; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))
; SVE: Cost of 9 for VF vscale x 16: EXPRESSION vp<{{.*}}> = vp<{{.*}}> + partial.reduce.add (sub (0, mul (ir<%load2> sext to i32), (ir<%load3> sext to i32)))
; NEON: Cost of 1 for VF 16: EXPRESSION vp<{{.*}}> = ir<%acc> + partial.reduce.add (mul (ir<%load1> sext to i32), (ir<%load2> sext to i32))
; NEON: Cost of 9 for VF 16: EXPRESSION vp<{{.*}}> = vp<{{.*}}> + partial.reduce.add (sub (0, mul (ir<%load2> sext to i32), (ir<%load3> sext to i32)))

target triple = "aarch64"

; Test the cost of a SUB reduction, where the SUB is implemented outside the loop
; and therefore not part of the partial reduction.
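;
; A rough C equivalent of the loop (an illustrative sketch; the names and the
; do-while shape are paraphrased from the IR below, which is the authoritative
; input):
;
;   int32_t acc = init;
;   uint32_t i = 0;
;   do {
;     acc -= (int32_t)arr1[i] * (int32_t)arr2[i];
;   } while (++i < n);
;   return acc;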
define i32 @sub_reduction(ptr %arr1, ptr %arr2, i32 %init, i32 %n) #0 {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi i32 [ %init, %entry ], [ %sub, %loop ]
  %gep1 = getelementptr inbounds i8, ptr %arr1, i32 %iv
  %load1 = load i8, ptr %gep1
  %sext1 = sext i8 %load1 to i32
  %gep2 = getelementptr inbounds i8, ptr %arr2, i32 %iv
  %load2 = load i8, ptr %gep2
  %sext2 = sext i8 %load2 to i32
  %mul = mul i32 %sext1, %sext2
  %sub = sub i32 %acc, %mul
  %iv.next = add i32 %iv, 1
  %cmp = icmp ult i32 %iv.next, %n
  br i1 %cmp, label %loop, label %exit, !llvm.loop !0

exit:
  ret i32 %sub
}

; Test that the cost of a SUB that is part of an ADD-SUB reduction chain
; is high, because the negation happens inside the loop and cannot be
; folded into the SDOT instruction (because of the extend).
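;
; A rough C equivalent (again an illustrative sketch paraphrased from the IR):
;
;   int32_t acc = init;
;   uint32_t i = 0;
;   do {
;     acc += (int32_t)arr1[i] * (int32_t)arr2[i];
;     acc -= (int32_t)arr2[i] * (int32_t)arr3[i];
;   } while (++i < n);
;   return acc;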
define i32 @add_sub_chained_reduction(ptr %arr1, ptr %arr2, ptr %arr3, i32 %init, i32 %n) #0 {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi i32 [ %init, %entry ], [ %sub, %loop ]
  %gep1 = getelementptr inbounds i8, ptr %arr1, i32 %iv
  %load1 = load i8, ptr %gep1
  %sext1 = sext i8 %load1 to i32
  %gep2 = getelementptr inbounds i8, ptr %arr2, i32 %iv
  %load2 = load i8, ptr %gep2
  %sext2 = sext i8 %load2 to i32
  %mul1 = mul i32 %sext1, %sext2
  %add = add i32 %acc, %mul1
  %gep3 = getelementptr inbounds i8, ptr %arr3, i32 %iv
  %load3 = load i8, ptr %gep3
  %sext3 = sext i8 %load3 to i32
  %mul2 = mul i32 %sext2, %sext3
  %sub = sub i32 %add, %mul2
  %iv.next = add i32 %iv, 1
  %cmp = icmp ult i32 %iv.next, %n
  br i1 %cmp, label %loop, label %exit, !llvm.loop !0

exit:
  ret i32 %sub
}

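; vscale_range(1,16) tells the cost model that vscale lies between 1 and 16,
; which the SVE run relies on when costing the scalable VF.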
attributes #0 = { vscale_range(1,16) }

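; Pin the plan to a single configuration: no interleaving and a width hint of
; 16, which the two RUN lines cost as VF vscale x 16 (SVE) and VF 16 (NEON).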
!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.interleave.count", i32 1}
!2 = !{!"llvm.loop.vectorize.width", i32 16}