| ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost.of.*EXPRESSION" --version 6 |
| ; RUN: opt -passes=loop-vectorize \ |
| ; RUN: -scalable-vectorization=off -mattr=+fp16fml \ |
| ; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ |
| ; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NEON |
| ; RUN: opt -passes=loop-vectorize \ |
| ; RUN: -scalable-vectorization=on -mattr=+sve \ |
| ; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ |
| ; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE |
| ; RUN: opt -passes=loop-vectorize \ |
| ; RUN: -scalable-vectorization=on -mattr=+sve2 \ |
| ; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ |
| ; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE2 |
| ; RUN: opt -passes=loop-vectorize \ |
| ; RUN: -scalable-vectorization=on -mattr=+sve2p1 \ |
| ; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ |
| ; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE2p1 |
| ; RUN: opt -passes=loop-vectorize \ |
| ; RUN: -scalable-vectorization=on -mattr=+sve2p3 \ |
| ; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ |
| ; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE2p3 |
| ; RUN: opt -passes=loop-vectorize \ |
| ; RUN: -scalable-vectorization=on -mattr=+sve,+sme2 \ |
| ; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ |
| ; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SME2 |
| ; RUN: opt -passes=loop-vectorize \ |
| ; RUN: -scalable-vectorization=on -mattr=+sve2,+i8mm \ |
| ; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \ |
| ; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=I8MM |
| |
| ; This test inspects the loop vectorizer's debug output (-debug-only=loop-vectorize |
| ; in the run lines above) and, per the update-script filter, records only the |
| ; "Cost of ... EXPRESSION" lines. Debug output is only produced by builds with |
| ; assertions enabled, hence the requirement below. |
| ; REQUIRES: asserts |
| target triple = "aarch64" |
| |
| ; sub(i16, zext(i8)->i16 * zext(i8)->i16) |
| ; |
| ; Scalar loop computing, per element i, |
| ;   acc = acc + zext(src1[i]) * zext(src2[i]) - zext(src2[i]) * zext(src3[i]) |
| ; on an i16 accumulator seeded from %init; the final accumulator is returned. |
| ; The assertions record the cost printed for the two partial.reduce.add |
| ; EXPRESSION recipes (the added product and the negated product). Each expected |
| ; line is listed twice. The plain +sve run has a label but no cost lines. |
| define i16 @sub_reduction_i16_zext_i8_zext_i8(ptr %src1, ptr %src2, ptr %src3, i16 %init, i32 %n) { |
| ; NEON-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' |
| ; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; |
| ; SVE-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' |
| ; SVE2-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' |
| ; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; |
| ; SVE2p1-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' |
| ; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; |
| ; SVE2p3-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' |
| ; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; SVE2p3: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; SVE2p3: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; |
| ; SME2-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' |
| ; SME2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; SME2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; SME2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; SME2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; |
| ; I8MM-LABEL: 'sub_reduction_i16_zext_i8_zext_i8' |
| ; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16)) |
| ; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16))) |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| ; %iv is the element index; %acc carries the running reduction value. |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i16 [ %init, %entry ], [ %sub2, %loop ] |
| ; First product: zext(src1[iv]) * zext(src2[iv]). |
| %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv |
| %load1 = load i8, ptr %gep1 |
| %zext1 = zext i8 %load1 to i16 |
| %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv |
| %load2 = load i8, ptr %gep2 |
| %zext2 = zext i8 %load2 to i16 |
| %mul12 = mul i16 %zext1, %zext2 |
| ; Second product reuses the src2 element: zext(src2[iv]) * zext(src3[iv]). |
| %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv |
| %load3 = load i8, ptr %gep3 |
| %zext3 = zext i8 %load3 to i16 |
| %mul13 = mul i16 %zext2, %zext3 |
| ; Reduction chain: add the first product, then subtract the second. |
| %add1 = add i16 %acc, %mul12 |
| %sub2 = sub i16 %add1, %mul13 |
| ; Unsigned compare on the incremented index; the body runs once before the first test. |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !0 |
| |
| exit: |
| ret i16 %sub2 |
| } |
| |
| ; Loop hints attached to the back-edge above: interleave count 1, vectorize width 16. |
| !0 = distinct !{!0, !1, !2} |
| !1 = !{!"llvm.loop.interleave.count", i32 1} |
| !2 = !{!"llvm.loop.vectorize.width", i32 16} |
| |
| ; There is no usdot for i8 -> i16, so a regular reduction is preferred due to |
| ; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION. |
| ; |
| ; sub(i16, zext(i8)->i16 * sext(i8)->i16) |
| ; |
| ; Mixed-extension variant. Per element i, |
| ;   acc = acc + zext(src1[i]) * sext(src2[i]) - zext(src1[i]) * sext(src3[i]) |
| ; on an i16 accumulator seeded from %init; the final accumulator is returned. |
| ; Only the function label is checked for every run configuration. |
| ; NOTE(review): a label-only block still passes if an EXPRESSION cost line is |
| ; printed, since nothing here rejects extra output - confirm that is acceptable. |
| define i16 @sub_reduction_i16_zext_i8_sext_i8(ptr %src1, ptr %src2, ptr %src3, i16 %init, i32 %n) { |
| ; NEON-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' |
| ; SVE-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' |
| ; SVE2-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' |
| ; SVE2p1-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' |
| ; SVE2p3-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' |
| ; SME2-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' |
| ; I8MM-LABEL: 'sub_reduction_i16_zext_i8_sext_i8' |
| entry: |
| br label %loop |
| |
| loop: |
| ; %iv is the element index; %acc carries the running reduction value. |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i16 [ %init, %entry ], [ %sub2, %loop ] |
| ; First product: zext(src1[iv]) * sext(src2[iv]). |
| %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv |
| %load1 = load i8, ptr %gep1 |
| %zext1 = zext i8 %load1 to i16 |
| %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv |
| %load2 = load i8, ptr %gep2 |
| %sext2 = sext i8 %load2 to i16 |
| %mul12 = mul i16 %zext1, %sext2 |
| ; Second product reuses the zero-extended src1 element: zext(src1[iv]) * sext(src3[iv]). |
| %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv |
| %load3 = load i8, ptr %gep3 |
| %sext3 = sext i8 %load3 to i16 |
| %mul13 = mul i16 %zext1, %sext3 |
| ; Reduction chain: add the first product, then subtract the second. |
| %add1 = add i16 %acc, %mul12 |
| %sub2 = sub i16 %add1, %mul13 |
| ; Unsigned compare on the incremented index; the body runs once before the first test. |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !3 |
| |
| exit: |
| ret i16 %sub2 |
| } |
| |
| ; Loop hints attached to the back-edge above: interleave count 1, vectorize width 16. |
| !3 = distinct !{!3, !4, !5} |
| !4 = !{!"llvm.loop.interleave.count", i32 1} |
| !5 = !{!"llvm.loop.vectorize.width", i32 16} |
| |
| ; sub(i32, zext(i8)->i32 * zext(i8)->i32) |
| ; |
| ; Scalar loop computing, per element i, |
| ;   acc = acc + zext(src1[i]) * zext(src2[i]) - zext(src2[i]) * zext(src3[i]) |
| ; on an i32 accumulator seeded from %init; the final accumulator is returned. |
| ; Every scalable run lists cost 1 for the added product and cost 3 for the |
| ; negated product; each expected line is listed twice. |
| define i32 @sub_reduction_i32_zext_i8_zext_i8(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) { |
| ; For the NEON run line, no partial reductions are generated because the operation is not natively supported (hence no check line for EXPRESSION). |
| ; |
| ; NEON-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' |
| ; SVE-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' |
| ; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| ; SVE2-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' |
| ; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| ; SVE2p1-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' |
| ; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| ; SVE2p3-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' |
| ; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| ; SME2-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' |
| ; SME2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SME2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; SME2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SME2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| ; I8MM-LABEL: 'sub_reduction_i32_zext_i8_zext_i8' |
| ; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| ; %iv is the element index; %acc carries the running reduction value. |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i32 [ %init, %entry ], [ %sub2, %loop ] |
| ; First product: zext(src1[iv]) * zext(src2[iv]). |
| %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv |
| %load1 = load i8, ptr %gep1 |
| %zext1 = zext i8 %load1 to i32 |
| %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv |
| %load2 = load i8, ptr %gep2 |
| %zext2 = zext i8 %load2 to i32 |
| %mul12 = mul i32 %zext1, %zext2 |
| ; Second product reuses the src2 element: zext(src2[iv]) * zext(src3[iv]). |
| %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv |
| %load3 = load i8, ptr %gep3 |
| %zext3 = zext i8 %load3 to i32 |
| %mul13 = mul i32 %zext2, %zext3 |
| ; Reduction chain: add the first product, then subtract the second. |
| %add1 = add i32 %acc, %mul12 |
| %sub2 = sub i32 %add1, %mul13 |
| ; Unsigned compare on the incremented index; the body runs once before the first test. |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !6 |
| |
| exit: |
| ret i32 %sub2 |
| } |
| |
| ; Loop hints attached to the back-edge above: interleave count 1, vectorize width 16. |
| !6 = distinct !{!6, !7, !8} |
| !7 = !{!"llvm.loop.interleave.count", i32 1} |
| !8 = !{!"llvm.loop.vectorize.width", i32 16} |
| |
| ; sub(i32, zext(i8)->i32 * sext(i8)->i32) |
| ; |
| ; Mixed-extension variant. Per element i, |
| ;   acc = acc + zext(src1[i]) * sext(src2[i]) - zext(src1[i]) * sext(src3[i]) |
| ; on an i32 accumulator seeded from %init; the final accumulator is returned. |
| ; Only the +i8mm run lists cost lines (1 for the added product, 3 for the negated |
| ; product, each listed twice); the other runs check the label only. |
| ; NOTE(review): presumably +i8mm is what makes a mixed-sign i8 -> i32 dot product |
| ; available here - confirm against the cost model. |
| define i32 @sub_reduction_i32_zext_i8_sext_i8(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) { |
| ; NEON-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' |
| ; SVE-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' |
| ; SVE2-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' |
| ; SVE2p1-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' |
| ; SVE2p3-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' |
| ; SME2-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' |
| ; I8MM-LABEL: 'sub_reduction_i32_zext_i8_sext_i8' |
| ; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> sext to i32)) |
| ; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load1> zext to i32), (ir<%load3> sext to i32))) |
| ; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> sext to i32)) |
| ; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load1> zext to i32), (ir<%load3> sext to i32))) |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| ; %iv is the element index; %acc carries the running reduction value. |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i32 [ %init, %entry ], [ %sub2, %loop ] |
| ; First product: zext(src1[iv]) * sext(src2[iv]). |
| %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv |
| %load1 = load i8, ptr %gep1 |
| %zext1 = zext i8 %load1 to i32 |
| %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv |
| %load2 = load i8, ptr %gep2 |
| %sext2 = sext i8 %load2 to i32 |
| %mul12 = mul i32 %zext1, %sext2 |
| ; Second product reuses the zero-extended src1 element: zext(src1[iv]) * sext(src3[iv]). |
| %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv |
| %load3 = load i8, ptr %gep3 |
| %sext3 = sext i8 %load3 to i32 |
| %mul13 = mul i32 %zext1, %sext3 |
| ; Reduction chain: add the first product, then subtract the second. |
| %add1 = add i32 %acc, %mul12 |
| %sub2 = sub i32 %add1, %mul13 |
| ; Unsigned compare on the incremented index; the body runs once before the first test. |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !9 |
| |
| exit: |
| ret i32 %sub2 |
| } |
| |
| ; Loop hints attached to the back-edge above: interleave count 1, vectorize width 16. |
| !9 = distinct !{!9, !10, !11} |
| !10 = !{!"llvm.loop.interleave.count", i32 1} |
| !11 = !{!"llvm.loop.vectorize.width", i32 16} |
| |
| ; sub(i64, zext(i8)->i64 * zext(i8)->i64) |
| ; |
| ; Scalar loop computing, per element i, |
| ;   acc = acc + zext(src1[i]) * zext(src2[i]) - zext(src2[i]) * zext(src3[i]) |
| ; on an i64 accumulator seeded from %init; the final accumulator is returned. |
| ; Every scalable run lists cost 1 for the added product and cost 3 for the |
| ; negated product; each expected line is listed twice. |
| define i64 @sub_reduction_i64_zext_i8_zext_i8(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { |
| ; For the NEON run line, no partial reductions are generated because the operation is not natively supported (hence no check line for EXPRESSION). |
| ; |
| ; NEON-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' |
| ; SVE-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' |
| ; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SVE2-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' |
| ; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SVE2p1-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' |
| ; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SVE2p3-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' |
| ; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SME2-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' |
| ; SME2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SME2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SME2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SME2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; I8MM-LABEL: 'sub_reduction_i64_zext_i8_zext_i8' |
| ; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| ; %iv is the element index; %acc carries the running reduction value. |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] |
| ; First product: zext(src1[iv]) * zext(src2[iv]). |
| %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv |
| %load1 = load i8, ptr %gep1 |
| %zext1 = zext i8 %load1 to i64 |
| %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv |
| %load2 = load i8, ptr %gep2 |
| %zext2 = zext i8 %load2 to i64 |
| %mul12 = mul i64 %zext1, %zext2 |
| ; Second product reuses the src2 element: zext(src2[iv]) * zext(src3[iv]). |
| %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv |
| %load3 = load i8, ptr %gep3 |
| %zext3 = zext i8 %load3 to i64 |
| %mul13 = mul i64 %zext2, %zext3 |
| ; Reduction chain: add the first product, then subtract the second. |
| %add1 = add i64 %acc, %mul12 |
| %sub2 = sub i64 %add1, %mul13 |
| ; Unsigned compare on the incremented index; the body runs once before the first test. |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !12 |
| |
| exit: |
| ret i64 %sub2 |
| } |
| |
| ; Loop hints attached to the back-edge above: interleave count 1, vectorize width 16. |
| !12 = distinct !{!12, !13, !14} |
| !13 = !{!"llvm.loop.interleave.count", i32 1} |
| !14 = !{!"llvm.loop.vectorize.width", i32 16} |
| |
| ; There is no usdot for i8 -> i64, so a regular reduction is preferred due to |
| ; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION. |
| ; |
| ; sub(i64, zext(i8)->i64 * sext(i8)->i64) |
| ; |
| ; Mixed-extension variant. Per element i, |
| ;   acc = acc + zext(src1[i]) * sext(src2[i]) - zext(src1[i]) * sext(src3[i]) |
| ; on an i64 accumulator seeded from %init; the final accumulator is returned. |
| ; Only the function label is checked for every run configuration. |
| ; NOTE(review): a label-only block still passes if an EXPRESSION cost line is |
| ; printed, since nothing here rejects extra output - confirm that is acceptable. |
| define i64 @sub_reduction_i64_zext_i8_sext_i8(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { |
| ; NEON-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' |
| ; SVE-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' |
| ; SVE2-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' |
| ; SVE2p1-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' |
| ; SVE2p3-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' |
| ; SME2-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' |
| ; I8MM-LABEL: 'sub_reduction_i64_zext_i8_sext_i8' |
| entry: |
| br label %loop |
| |
| loop: |
| ; %iv is the element index; %acc carries the running reduction value. |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] |
| ; First product: zext(src1[iv]) * sext(src2[iv]). |
| %gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv |
| %load1 = load i8, ptr %gep1 |
| %zext1 = zext i8 %load1 to i64 |
| %gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv |
| %load2 = load i8, ptr %gep2 |
| %sext2 = sext i8 %load2 to i64 |
| %mul12 = mul i64 %zext1, %sext2 |
| ; Second product reuses the zero-extended src1 element: zext(src1[iv]) * sext(src3[iv]). |
| %gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv |
| %load3 = load i8, ptr %gep3 |
| %sext3 = sext i8 %load3 to i64 |
| %mul13 = mul i64 %zext1, %sext3 |
| ; Reduction chain: add the first product, then subtract the second. |
| %add1 = add i64 %acc, %mul12 |
| %sub2 = sub i64 %add1, %mul13 |
| ; Unsigned compare on the incremented index; the body runs once before the first test. |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !15 |
| |
| exit: |
| ret i64 %sub2 |
| } |
| |
| ; Loop hints attached to the back-edge above: interleave count 1, vectorize width 16. |
| !15 = distinct !{!15, !16, !17} |
| !16 = !{!"llvm.loop.interleave.count", i32 1} |
| !17 = !{!"llvm.loop.vectorize.width", i32 16} |
| |
| ; sub(i32, zext(i16)->i32 * zext(i16)->i32) |
| ; |
| ; Scalar loop over i16 elements computing, per element i, |
| ;   acc = acc + zext(src1[i]) * zext(src2[i]) - zext(src2[i]) * zext(src3[i]) |
| ; on an i32 accumulator seeded from %init; the final accumulator is returned. |
| ; The assertions record the cost printed for the two partial.reduce.add |
| ; EXPRESSION recipes at VF 8 / vscale x 8. Each expected line is listed twice. |
| ; The plain +sve run has a label but no cost lines. |
| define i32 @sub_reduction_i32_zext_i16_zext_i16(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) { |
| ; NEON-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' |
| ; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| ; SVE-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' |
| ; SVE2-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' |
| ; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| ; SVE2p1-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' |
| ; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2p1: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2p1: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| ; SVE2p3-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' |
| ; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2p3: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SVE2p3: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| ; SME2-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' |
| ; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SME2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; SME2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| ; I8MM-LABEL: 'sub_reduction_i32_zext_i16_zext_i16' |
| ; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32)) |
| ; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32))) |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| ; %iv is the element index; %acc carries the running reduction value. |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i32 [ %init, %entry ], [ %sub2, %loop ] |
| ; First product: zext(src1[iv]) * zext(src2[iv]). |
| %gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv |
| %load1 = load i16, ptr %gep1 |
| %zext1 = zext i16 %load1 to i32 |
| %gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv |
| %load2 = load i16, ptr %gep2 |
| %zext2 = zext i16 %load2 to i32 |
| %mul12 = mul i32 %zext1, %zext2 |
| ; Second product reuses the src2 element: zext(src2[iv]) * zext(src3[iv]). |
| %gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv |
| %load3 = load i16, ptr %gep3 |
| %zext3 = zext i16 %load3 to i32 |
| %mul13 = mul i32 %zext2, %zext3 |
| ; Reduction chain: add the first product, then subtract the second. |
| %add1 = add i32 %acc, %mul12 |
| %sub2 = sub i32 %add1, %mul13 |
| ; Unsigned compare on the incremented index; the body runs once before the first test. |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !18 |
| |
| exit: |
| ret i32 %sub2 |
| } |
| |
| ; Loop hints attached to the back-edge above: interleave count 1, vectorize width 8. |
| !18 = distinct !{!18, !19, !20} |
| !19 = !{!"llvm.loop.interleave.count", i32 1} |
| !20 = !{!"llvm.loop.vectorize.width", i32 8} |
| |
| ; There is no usdot for i16 -> i32, so a regular reduction is preferred due to |
| ; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION. |
| ; |
| ; sub(i32, zext(i16)->i32 * sext(i16)->i32) |
| ; |
| ; Mixed-extension variant over i16 elements. Per element i, |
| ;   acc = acc + zext(src1[i]) * sext(src2[i]) - zext(src1[i]) * sext(src3[i]) |
| ; on an i32 accumulator seeded from %init; the final accumulator is returned. |
| ; Only the function label is checked for every run configuration. |
| ; NOTE(review): a label-only block still passes if an EXPRESSION cost line is |
| ; printed, since nothing here rejects extra output - confirm that is acceptable. |
| define i32 @sub_reduction_i32_zext_i16_sext_i16(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) { |
| ; NEON-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' |
| ; SVE-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' |
| ; SVE2-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' |
| ; SVE2p1-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' |
| ; SVE2p3-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' |
| ; SME2-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' |
| ; I8MM-LABEL: 'sub_reduction_i32_zext_i16_sext_i16' |
| entry: |
| br label %loop |
| |
| loop: |
| ; %iv is the element index; %acc carries the running reduction value. |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i32 [ %init, %entry ], [ %sub2, %loop ] |
| ; First product: zext(src1[iv]) * sext(src2[iv]). |
| %gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv |
| %load1 = load i16, ptr %gep1 |
| %zext1 = zext i16 %load1 to i32 |
| %gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv |
| %load2 = load i16, ptr %gep2 |
| %sext2 = sext i16 %load2 to i32 |
| %mul12 = mul i32 %zext1, %sext2 |
| ; Second product reuses the zero-extended src1 element: zext(src1[iv]) * sext(src3[iv]). |
| %gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv |
| %load3 = load i16, ptr %gep3 |
| %sext3 = sext i16 %load3 to i32 |
| %mul13 = mul i32 %zext1, %sext3 |
| ; Reduction chain: add the first product, then subtract the second. |
| %add1 = add i32 %acc, %mul12 |
| %sub2 = sub i32 %add1, %mul13 |
| ; Unsigned compare on the incremented index; the body runs once before the first test. |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !21 |
| |
| exit: |
| ret i32 %sub2 |
| } |
| |
| ; Loop hints attached to the back-edge above: interleave count 1, vectorize width 8. |
| !21 = distinct !{!21, !22, !23} |
| !22 = !{!"llvm.loop.interleave.count", i32 1} |
| !23 = !{!"llvm.loop.vectorize.width", i32 8} |
| |
| ; sub(i64, zext(i16)->i64 * zext(i16)->i64) |
| define i64 @sub_reduction_i64_zext_i16_zext_i16(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { |
| ; For the NEON run line, no partial reductions are generated because the operation is not natively supported (hence no check line for EXPRESSION). |
| ; |
| ; NEON-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' |
| ; SVE-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' |
| ; SVE: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SVE2-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' |
| ; SVE2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SVE2p1-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' |
| ; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p1: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p1: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SVE2p3-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' |
| ; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p3: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p3: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SME2-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' |
| ; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SME2: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SME2: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; I8MM-LABEL: 'sub_reduction_i64_zext_i16_zext_i16' |
| ; I8MM: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; I8MM: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; I8MM: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; I8MM: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] |
| %gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv |
| %load1 = load i16, ptr %gep1 |
| %zext1 = zext i16 %load1 to i64 |
| %gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv |
| %load2 = load i16, ptr %gep2 |
| %zext2 = zext i16 %load2 to i64 |
| ; First product: src1 element times src2 element, both zero-extended to i64. |
| %mul12 = mul i64 %zext1, %zext2 |
| %gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv |
| %load3 = load i16, ptr %gep3 |
| %zext3 = zext i16 %load3 to i64 |
| ; Second product reuses %zext2 (the src2 element) and multiplies it by the src3 element. |
| %mul13 = mul i64 %zext2, %zext3 |
| ; Reduction chain carried through %acc: acc + mul12 - mul13. |
| %add1 = add i64 %acc, %mul12 |
| %sub2 = sub i64 %add1, %mul13 |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !24 |
| |
| exit: |
| ret i64 %sub2 |
| } |
| |
| ; Loop hints attached to the loop above via !llvm.loop !24 (interleave count 1, vectorize width 8). |
| !24 = distinct !{!24, !25, !26} |
| !25 = !{!"llvm.loop.interleave.count", i32 1} |
| !26 = !{!"llvm.loop.vectorize.width", i32 8} |
| |
| ; sub(i64, zext(i16)->i64 * sext(i16)->i64) |
| define i64 @sub_reduction_i64_zext_i16_sext_i16(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { |
| ; Only the function label is matched for each run line here; no EXPRESSION cost lines are expected. |
| ; Presumably this is because there is no mixed-sign (usdot-style) dot product for i16 -> i64, as the |
| ; file notes for the i32 -> i64 variant - TODO confirm. |
| ; |
| ; NEON-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' |
| ; SVE-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' |
| ; SVE2-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' |
| ; SVE2p1-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' |
| ; SVE2p3-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' |
| ; SME2-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' |
| ; I8MM-LABEL: 'sub_reduction_i64_zext_i16_sext_i16' |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] |
| %gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv |
| %load1 = load i16, ptr %gep1 |
| %zext1 = zext i16 %load1 to i64 |
| %gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv |
| %load2 = load i16, ptr %gep2 |
| %sext2 = sext i16 %load2 to i64 |
| ; First product: zero-extended src1 element times sign-extended src2 element. |
| %mul12 = mul i64 %zext1, %sext2 |
| %gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv |
| %load3 = load i16, ptr %gep3 |
| %sext3 = sext i16 %load3 to i64 |
| ; Second product reuses %zext1 and multiplies it by the sign-extended src3 element. |
| %mul13 = mul i64 %zext1, %sext3 |
| ; Reduction chain carried through %acc: acc + mul12 - mul13. |
| %add1 = add i64 %acc, %mul12 |
| %sub2 = sub i64 %add1, %mul13 |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !27 |
| |
| exit: |
| ret i64 %sub2 |
| } |
| |
| ; Loop hints attached to the loop above via !llvm.loop !27 (interleave count 1, vectorize width 8). |
| !27 = distinct !{!27, !28, !29} |
| !28 = !{!"llvm.loop.interleave.count", i32 1} |
| !29 = !{!"llvm.loop.vectorize.width", i32 8} |
| |
| ; sub(i64, zext(i32)->i64 * zext(i32)->i64) |
| define i64 @sub_reduction_i64_zext_i32_zext_i32(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { |
| ; The plain SVE run line only matches the function label below; every other run line expects |
| ; EXPRESSION cost lines for both links of the add/sub chain. |
| ; |
| ; NEON-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' |
| ; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SVE-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' |
| ; SVE2-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' |
| ; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SVE2p1-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SVE2p3-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; SME2-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' |
| ; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| ; I8MM-LABEL: 'sub_reduction_i64_zext_i32_zext_i32' |
| ; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64)) |
| ; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64))) |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] |
| %gep1 = getelementptr inbounds i32, ptr %src1, i32 %iv |
| %load1 = load i32, ptr %gep1 |
| %zext1 = zext i32 %load1 to i64 |
| %gep2 = getelementptr inbounds i32, ptr %src2, i32 %iv |
| %load2 = load i32, ptr %gep2 |
| %zext2 = zext i32 %load2 to i64 |
| ; First product: src1 element times src2 element, both zero-extended to i64. |
| %mul12 = mul i64 %zext1, %zext2 |
| %gep3 = getelementptr inbounds i32, ptr %src3, i32 %iv |
| %load3 = load i32, ptr %gep3 |
| %zext3 = zext i32 %load3 to i64 |
| ; Second product reuses %zext2 (the src2 element) and multiplies it by the src3 element. |
| %mul13 = mul i64 %zext2, %zext3 |
| ; Reduction chain carried through %acc: acc + mul12 - mul13. |
| %add1 = add i64 %acc, %mul12 |
| %sub2 = sub i64 %add1, %mul13 |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !30 |
| |
| exit: |
| ret i64 %sub2 |
| } |
| |
| ; Loop hints attached to the loop above via !llvm.loop !30 (interleave count 1, vectorize width 4). |
| !30 = distinct !{!30, !31, !32} |
| !31 = !{!"llvm.loop.interleave.count", i32 1} |
| !32 = !{!"llvm.loop.vectorize.width", i32 4} |
| |
| ; There is no usdot for i32 -> i64, so a regular reduction is preferred due to |
| ; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION. |
| ; |
| ; sub(i64, zext(i32)->i64 * sext(i32)->i64) |
| define i64 @sub_reduction_i64_zext_i32_sext_i32(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) { |
| ; Only the function label is matched for each run line; no EXPRESSION cost lines are expected. |
| ; |
| ; NEON-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' |
| ; SVE-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' |
| ; SVE2-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' |
| ; SVE2p1-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' |
| ; SVE2p3-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' |
| ; SME2-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' |
| ; I8MM-LABEL: 'sub_reduction_i64_zext_i32_sext_i32' |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi i64 [ %init, %entry ], [ %sub2, %loop ] |
| %gep1 = getelementptr inbounds i32, ptr %src1, i32 %iv |
| %load1 = load i32, ptr %gep1 |
| %zext1 = zext i32 %load1 to i64 |
| %gep2 = getelementptr inbounds i32, ptr %src2, i32 %iv |
| %load2 = load i32, ptr %gep2 |
| %sext2 = sext i32 %load2 to i64 |
| ; First product: zero-extended src1 element times sign-extended src2 element. |
| %mul12 = mul i64 %zext1, %sext2 |
| %gep3 = getelementptr inbounds i32, ptr %src3, i32 %iv |
| %load3 = load i32, ptr %gep3 |
| %sext3 = sext i32 %load3 to i64 |
| ; Second product reuses %zext1 and multiplies it by the sign-extended src3 element. |
| %mul13 = mul i64 %zext1, %sext3 |
| ; Reduction chain carried through %acc: acc + mul12 - mul13. |
| %add1 = add i64 %acc, %mul12 |
| %sub2 = sub i64 %add1, %mul13 |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| ; The loop metadata node !33 is defined further down in the file, after @reduce_fsub_fadd_chain_without_mul. |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !33 |
| |
| exit: |
| ret i64 %sub2 |
| } |
| |
| ; Sub/add reduction chain with no multiply: each iteration computes %accum - sext(a[i]) + sext(b[i]), |
| ; where the i16 elements are sign-extended to i32. The loop carries no !llvm.loop hints and exits |
| ; once %iv.next equals 1025. |
| define i32 @reduce_sub_add_chain_without_mul(ptr %a, ptr noalias %b) { |
| ; The NEON and plain SVE run lines only match the function label below. |
| ; |
| ; NEON-LABEL: 'reduce_sub_add_chain_without_mul' |
| ; SVE-LABEL: 'reduce_sub_add_chain_without_mul' |
| ; SVE2-LABEL: 'reduce_sub_add_chain_without_mul' |
| ; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32) |
| ; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32) |
| ; |
| ; SVE2p1-LABEL: 'reduce_sub_add_chain_without_mul' |
| ; SVE2p1: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32) |
| ; SVE2p1: Cost of 1 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32) |
| ; SVE2p1: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32) |
| ; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32) |
| ; |
| ; SVE2p3-LABEL: 'reduce_sub_add_chain_without_mul' |
| ; SVE2p3: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32) |
| ; SVE2p3: Cost of 1 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32) |
| ; SVE2p3: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32) |
| ; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32) |
| ; |
| ; SME2-LABEL: 'reduce_sub_add_chain_without_mul' |
| ; SME2: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32) |
| ; SME2: Cost of 1 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32) |
| ; SME2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32) |
| ; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32) |
| ; |
| ; I8MM-LABEL: 'reduce_sub_add_chain_without_mul' |
| ; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32) |
| ; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32) |
| ; |
| entry: |
| br label %for.body |
| |
| for.body: |
| %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] |
| %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] |
| %gep.a = getelementptr i16, ptr %a, i64 %iv |
| %load.a = load i16, ptr %gep.a, align 1 |
| %ext.a = sext i16 %load.a to i32 |
| %gep.b = getelementptr i16, ptr %b, i64 %iv |
| %load.b = load i16, ptr %gep.b, align 1 |
| %ext.b = sext i16 %load.b to i32 |
| ; Reduction chain carried through %accum: subtract the extended a element, then add the extended b element. |
| %sub = sub i32 %accum, %ext.a |
| %add = add i32 %sub, %ext.b |
| %iv.next = add i64 %iv, 1 |
| %exitcond.not = icmp eq i64 %iv.next, 1025 |
| br i1 %exitcond.not, label %for.exit, label %for.body |
| |
| for.exit: |
| ret i32 %add |
| } |
| |
| ; Floating-point fsub/fadd reduction chain with no multiply: each iteration computes |
| ; %accum - fpext(a[i]) + fpext(b[i]), where the half elements are extended to float. The loop |
| ; carries no !llvm.loop hints and exits once %iv.next equals 1025. |
| define float @reduce_fsub_fadd_chain_without_mul(ptr %a, ptr noalias %b) { |
| ; The plain SVE run line only matches the function label below. |
| ; |
| ; NEON-LABEL: 'reduce_fsub_fadd_chain_without_mul' |
| ; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; |
| ; SVE-LABEL: 'reduce_fsub_fadd_chain_without_mul' |
| ; SVE2-LABEL: 'reduce_fsub_fadd_chain_without_mul' |
| ; SVE2: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SVE2: Cost of 2 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; |
| ; SVE2p1-LABEL: 'reduce_fsub_fadd_chain_without_mul' |
| ; SVE2p1: Cost of 2 for VF 4: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SVE2p1: Cost of 1 for VF 4: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; SVE2p1: Cost of 2 for VF 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SVE2p1: Cost of 1 for VF 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SVE2p1: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; SVE2p1: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; |
| ; SVE2p3-LABEL: 'reduce_fsub_fadd_chain_without_mul' |
| ; SVE2p3: Cost of 2 for VF 4: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SVE2p3: Cost of 1 for VF 4: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; SVE2p3: Cost of 2 for VF 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SVE2p3: Cost of 1 for VF 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SVE2p3: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; SVE2p3: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; |
| ; SME2-LABEL: 'reduce_fsub_fadd_chain_without_mul' |
| ; SME2: Cost of 4 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SME2: Cost of 1 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; SME2: Cost of 4 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; |
| ; I8MM-LABEL: 'reduce_fsub_fadd_chain_without_mul' |
| ; I8MM: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; I8MM: Cost of 2 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float) |
| ; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float) |
| ; |
| entry: |
| br label %for.body |
| |
| for.body: |
| %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] |
| ; The accumulator starts at -0.0 on entry. |
| %accum = phi float [ -0.0, %entry ], [ %add, %for.body ] |
| %gep.a = getelementptr half, ptr %a, i64 %iv |
| %load.a = load half, ptr %gep.a, align 1 |
| %ext.a = fpext half %load.a to float |
| %gep.b = getelementptr half, ptr %b, i64 %iv |
| %load.b = load half, ptr %gep.b, align 1 |
| %ext.b = fpext half %load.b to float |
| ; Reduction chain carried through %accum; both links carry the reassoc and contract flags. |
| %sub = fsub reassoc contract float %accum, %ext.a |
| %add = fadd reassoc contract float %sub, %ext.b |
| %iv.next = add i64 %iv, 1 |
| %exitcond.not = icmp eq i64 %iv.next, 1025 |
| br i1 %exitcond.not, label %for.exit, label %for.body |
| |
| for.exit: |
| ret float %add |
| } |
| |
| ; Loop hints referenced by the loop in @sub_reduction_i64_zext_i32_sext_i32 through !llvm.loop !33 |
| ; (interleave count 1, vectorize width 4). |
| !33 = distinct !{!33, !34, !35} |
| !34 = !{!"llvm.loop.interleave.count", i32 1} |
| !35 = !{!"llvm.loop.vectorize.width", i32 4} |
| |
| ; fsub(float, fpext(half)->float * fpext(half)->float) |
| define float @sub_reduction_float_fpext_half_fpext_half(ptr %src1, ptr %src2, ptr %src3, float %init, i32 %n) { |
| ; For the SVE run line, no partial reductions are generated because the operation is not natively supported (hence no check line for EXPRESSION). |
| ; |
| ; NEON-LABEL: 'sub_reduction_float_fpext_half_fpext_half' |
| ; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; SVE-LABEL: 'sub_reduction_float_fpext_half_fpext_half' |
| ; SVE2-LABEL: 'sub_reduction_float_fpext_half_fpext_half' |
| ; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; SVE2p1-LABEL: 'sub_reduction_float_fpext_half_fpext_half' |
| ; SVE2p1: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; SVE2p1: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; SVE2p3-LABEL: 'sub_reduction_float_fpext_half_fpext_half' |
| ; SVE2p3: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; SVE2p3: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; SME2-LABEL: 'sub_reduction_float_fpext_half_fpext_half' |
| ; SME2: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SME2: Cost of 6 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; SME2: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SME2: Cost of 6 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; I8MM-LABEL: 'sub_reduction_float_fpext_half_fpext_half' |
| ; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi float [ %init, %entry ], [ %sub2, %loop ] |
| %gep1 = getelementptr inbounds half, ptr %src1, i32 %iv |
| %load1 = load half, ptr %gep1 |
| %fpext1 = fpext half %load1 to float |
| %gep2 = getelementptr inbounds half, ptr %src2, i32 %iv |
| %load2 = load half, ptr %gep2 |
| %fpext2 = fpext half %load2 to float |
| ; First product: src1 element times src2 element, both extended to float. |
| ; Note that the fmul instructions carry no fast-math flags; only the fadd/fsub below are marked fast. |
| %mul12 = fmul float %fpext1, %fpext2 |
| %gep3 = getelementptr inbounds half, ptr %src3, i32 %iv |
| %load3 = load half, ptr %gep3 |
| %fpext3 = fpext half %load3 to float |
| ; Second product reuses %fpext2 (the src2 element) and multiplies it by the src3 element. |
| %mul13 = fmul float %fpext2, %fpext3 |
| ; Reduction chain carried through %acc: acc + mul12 - mul13. |
| %add1 = fadd fast float %acc, %mul12 |
| %sub2 = fsub fast float %add1, %mul13 |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !36 |
| |
| exit: |
| ret float %sub2 |
| } |
| |
| ; fsub(float, fpext(bfloat)->float * fpext(bfloat)->float) |
| define float @sub_reduction_float_fpext_bfloat_fpext_bfloat(ptr %src1, ptr %src2, ptr %src3, float %init, i32 %n) "target-features"="+bf16" { |
| ; NEON-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat' |
| ; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; NEON: Cost of 3 for VF 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; NEON: Cost of 3 for VF 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; SVE-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat' |
| ; SVE: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; SVE: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; SVE2-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat' |
| ; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; SVE2p1-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat' |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; SVE2p3-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat' |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; SME2-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat' |
| ; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| ; I8MM-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat' |
| ; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; I8MM: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float)) |
| ; I8MM: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float))) |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] |
| %acc = phi float [ %init, %entry ], [ %sub2, %loop ] |
| %gep1 = getelementptr inbounds bfloat, ptr %src1, i32 %iv |
| %load1 = load bfloat, ptr %gep1 |
| %fpext1 = fpext bfloat %load1 to float |
| %gep2 = getelementptr inbounds bfloat, ptr %src2, i32 %iv |
| %load2 = load bfloat, ptr %gep2 |
| %fpext2 = fpext bfloat %load2 to float |
| %mul12 = fmul float %fpext1, %fpext2 |
| %gep3 = getelementptr inbounds bfloat, ptr %src3, i32 %iv |
| %load3 = load bfloat, ptr %gep3 |
| %fpext3 = fpext bfloat %load3 to float |
| %mul13 = fmul float %fpext2, %fpext3 |
| %add1 = fadd fast float %acc, %mul12 |
| %sub2 = fsub fast float %add1, %mul13 |
| %iv.next = add i32 %iv, 1 |
| %cmp = icmp ult i32 %iv.next, %n |
| br i1 %cmp, label %loop, label %exit, !llvm.loop !36 |
| |
| exit: |
| ret float %sub2 |
| } |
| |
| ; Loop metadata attached to the back-edge branch tagged `!llvm.loop !36`. |
| ; !36 is a distinct, self-referential loop ID whose operands are the hints below. |
| !36 = distinct !{!36, !37, !38} |
| ; Hint: interleave count of 1. |
| !37 = !{!"llvm.loop.interleave.count", i32 1} |
| ; Hint: vectorization width of 4; the scalable-vector CHECK lines for this |
| ; loop report their costs for "VF vscale x 4". |
| !38 = !{!"llvm.loop.vectorize.width", i32 4} |