blob: ea6a001f8f9e6c83db4a75b350b59ff6ff18054f [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "Cost.of.*EXPRESSION" --version 6
; RUN: opt -passes=loop-vectorize \
; RUN: -scalable-vectorization=off -mattr=+fp16fml \
; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NEON
; RUN: opt -passes=loop-vectorize \
; RUN: -scalable-vectorization=on -mattr=+sve \
; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE
; RUN: opt -passes=loop-vectorize \
; RUN: -scalable-vectorization=on -mattr=+sve2 \
; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE2
; RUN: opt -passes=loop-vectorize \
; RUN: -scalable-vectorization=on -mattr=+sve2p1 \
; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE2p1
; RUN: opt -passes=loop-vectorize \
; RUN: -scalable-vectorization=on -mattr=+sve2p3 \
; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SVE2p3
; RUN: opt -passes=loop-vectorize \
; RUN: -scalable-vectorization=on -mattr=+sve,+sme2 \
; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SME2
; RUN: opt -passes=loop-vectorize \
; RUN: -scalable-vectorization=on -mattr=+sve2,+i8mm \
; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize \
; RUN: -disable-output < %s 2>&1 | FileCheck %s --check-prefix=I8MM
; REQUIRES: asserts
;
; Test loops of the form
;   acc = (acc + ext(a[i]) * ext(b[i])) - ext(x[i]) * ext(c[i])
; i.e. a chained add/sub reduction of extended multiplies. Every run line
; prints the loop-vectorize debug output for one AArch64 feature set, and the
; autogenerated checks record the "Cost of ..." lines printed for
; partial.reduce.add EXPRESSION recipes (see the --filter in the NOTE line).
target triple = "aarch64"
; sub(i16, zext(i8)->i16 * zext(i8)->i16)
;
; Per iteration:
;   acc = (acc + zext(src1[i]) * zext(src2[i])) - zext(src2[i]) * zext(src3[i])
; with i8 loads widened into an i16 accumulator that starts at %init.
; For the plain +sve run line only a label check is present below, i.e. no
; EXPRESSION cost lines were recorded for it.
define i16 @sub_reduction_i16_zext_i8_zext_i8(ptr %src1, ptr %src2, ptr %src3, i16 %init, i32 %n) {
; NEON-LABEL: 'sub_reduction_i16_zext_i8_zext_i8'
; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; NEON: Cost of 2 for VF 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
;
; SVE-LABEL: 'sub_reduction_i16_zext_i8_zext_i8'
; SVE2-LABEL: 'sub_reduction_i16_zext_i8_zext_i8'
; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; SVE2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
;
; SVE2p1-LABEL: 'sub_reduction_i16_zext_i8_zext_i8'
; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; SVE2p1: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
;
; SVE2p3-LABEL: 'sub_reduction_i16_zext_i8_zext_i8'
; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; SVE2p3: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; SVE2p3: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
;
; SME2-LABEL: 'sub_reduction_i16_zext_i8_zext_i8'
; SME2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; SME2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
; SME2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; SME2: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
;
; I8MM-LABEL: 'sub_reduction_i16_zext_i8_zext_i8'
; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i16), (ir<%load2> zext to i16))
; I8MM: Cost of 2 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i16), (ir<%load3> zext to i16)))
;
entry:
br label %loop
loop:
; %iv is the i32 induction variable; %acc carries the running i16 reduction.
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i16 [ %init, %entry ], [ %sub2, %loop ]
; First product: zext(src1[iv]) * zext(src2[iv]).
%gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv
%load1 = load i8, ptr %gep1
%zext1 = zext i8 %load1 to i16
%gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv
%load2 = load i8, ptr %gep2
%zext2 = zext i8 %load2 to i16
%mul12 = mul i16 %zext1, %zext2
; Second product: zext(src2[iv]) * zext(src3[iv]); it reuses %zext2.
%gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv
%load3 = load i8, ptr %gep3
%zext3 = zext i8 %load3 to i16
%mul13 = mul i16 %zext2, %zext3
; Reduction chain: add the first product, then subtract the second.
%add1 = add i16 %acc, %mul12
%sub2 = sub i16 %add1, %mul13
; Continue while the incremented counter is unsigned-less-than %n.
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !0
exit:
ret i16 %sub2
}
; Loop hints attached to the latch branch above: interleave count 1 and
; vectorize width 16.
!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.interleave.count", i32 1}
!2 = !{!"llvm.loop.vectorize.width", i32 16}
; There is no usdot for i8 -> i16, so a regular reduction is preferred due to
; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION.
;
; sub(i16, zext(i8)->i16 * sext(i8)->i16)
;
; Per iteration:
;   acc = (acc + zext(src1[i]) * sext(src2[i])) - zext(src1[i]) * sext(src3[i])
; with i8 loads widened into an i16 accumulator that starts at %init.
; Every run line has only a label check in this function.
define i16 @sub_reduction_i16_zext_i8_sext_i8(ptr %src1, ptr %src2, ptr %src3, i16 %init, i32 %n) {
; NEON-LABEL: 'sub_reduction_i16_zext_i8_sext_i8'
; SVE-LABEL: 'sub_reduction_i16_zext_i8_sext_i8'
; SVE2-LABEL: 'sub_reduction_i16_zext_i8_sext_i8'
; SVE2p1-LABEL: 'sub_reduction_i16_zext_i8_sext_i8'
; SVE2p3-LABEL: 'sub_reduction_i16_zext_i8_sext_i8'
; SME2-LABEL: 'sub_reduction_i16_zext_i8_sext_i8'
; I8MM-LABEL: 'sub_reduction_i16_zext_i8_sext_i8'
entry:
br label %loop
loop:
; %iv is the i32 induction variable; %acc carries the running i16 reduction.
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i16 [ %init, %entry ], [ %sub2, %loop ]
; First product: zext(src1[iv]) * sext(src2[iv]) (mixed extension kinds).
%gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv
%load1 = load i8, ptr %gep1
%zext1 = zext i8 %load1 to i16
%gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv
%load2 = load i8, ptr %gep2
%sext2 = sext i8 %load2 to i16
%mul12 = mul i16 %zext1, %sext2
; Second product: zext(src1[iv]) * sext(src3[iv]); it reuses %zext1.
%gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv
%load3 = load i8, ptr %gep3
%sext3 = sext i8 %load3 to i16
%mul13 = mul i16 %zext1, %sext3
; Reduction chain: add the first product, then subtract the second.
%add1 = add i16 %acc, %mul12
%sub2 = sub i16 %add1, %mul13
; Continue while the incremented counter is unsigned-less-than %n.
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !3
exit:
ret i16 %sub2
}
; Loop hints attached to the latch branch above: interleave count 1 and
; vectorize width 16.
!3 = distinct !{!3, !4, !5}
!4 = !{!"llvm.loop.interleave.count", i32 1}
!5 = !{!"llvm.loop.vectorize.width", i32 16}
; sub(i32, zext(i8)->i32 * zext(i8)->i32)
;
; Per iteration:
;   acc = (acc + zext(src1[i]) * zext(src2[i])) - zext(src2[i]) * zext(src3[i])
; with i8 loads widened into an i32 accumulator that starts at %init.
define i32 @sub_reduction_i32_zext_i8_zext_i8(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) {
; For the NEON run line, no partial reductions are generated because the operation is not natively supported (hence no check line for EXPRESSION).
;
; NEON-LABEL: 'sub_reduction_i32_zext_i8_zext_i8'
; SVE-LABEL: 'sub_reduction_i32_zext_i8_zext_i8'
; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
; SVE2-LABEL: 'sub_reduction_i32_zext_i8_zext_i8'
; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
; SVE2p1-LABEL: 'sub_reduction_i32_zext_i8_zext_i8'
; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
; SVE2p3-LABEL: 'sub_reduction_i32_zext_i8_zext_i8'
; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
; SME2-LABEL: 'sub_reduction_i32_zext_i8_zext_i8'
; SME2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SME2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; SME2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SME2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
; I8MM-LABEL: 'sub_reduction_i32_zext_i8_zext_i8'
; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
entry:
br label %loop
loop:
; %iv is the i32 induction variable; %acc carries the running i32 reduction.
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i32 [ %init, %entry ], [ %sub2, %loop ]
; First product: zext(src1[iv]) * zext(src2[iv]).
%gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv
%load1 = load i8, ptr %gep1
%zext1 = zext i8 %load1 to i32
%gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv
%load2 = load i8, ptr %gep2
%zext2 = zext i8 %load2 to i32
%mul12 = mul i32 %zext1, %zext2
; Second product: zext(src2[iv]) * zext(src3[iv]); it reuses %zext2.
%gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv
%load3 = load i8, ptr %gep3
%zext3 = zext i8 %load3 to i32
%mul13 = mul i32 %zext2, %zext3
; Reduction chain: add the first product, then subtract the second.
%add1 = add i32 %acc, %mul12
%sub2 = sub i32 %add1, %mul13
; Continue while the incremented counter is unsigned-less-than %n.
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !6
exit:
ret i32 %sub2
}
; Loop hints attached to the latch branch above: interleave count 1 and
; vectorize width 16.
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.interleave.count", i32 1}
!8 = !{!"llvm.loop.vectorize.width", i32 16}
; sub(i32, zext(i8)->i32 * sext(i8)->i32)
;
; Per iteration:
;   acc = (acc + zext(src1[i]) * sext(src2[i])) - zext(src1[i]) * sext(src3[i])
; with i8 loads widened into an i32 accumulator that starts at %init.
; Only the +i8mm run line has EXPRESSION cost lines recorded below; all other
; run lines have a label check only.
; NOTE(review): presumably because the mixed-sign dot product (usdot) needs
; +i8mm - confirm against the AArch64 cost model.
define i32 @sub_reduction_i32_zext_i8_sext_i8(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) {
; NEON-LABEL: 'sub_reduction_i32_zext_i8_sext_i8'
; SVE-LABEL: 'sub_reduction_i32_zext_i8_sext_i8'
; SVE2-LABEL: 'sub_reduction_i32_zext_i8_sext_i8'
; SVE2p1-LABEL: 'sub_reduction_i32_zext_i8_sext_i8'
; SVE2p3-LABEL: 'sub_reduction_i32_zext_i8_sext_i8'
; SME2-LABEL: 'sub_reduction_i32_zext_i8_sext_i8'
; I8MM-LABEL: 'sub_reduction_i32_zext_i8_sext_i8'
; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> sext to i32))
; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load1> zext to i32), (ir<%load3> sext to i32)))
; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> sext to i32))
; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load1> zext to i32), (ir<%load3> sext to i32)))
;
entry:
br label %loop
loop:
; %iv is the i32 induction variable; %acc carries the running i32 reduction.
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i32 [ %init, %entry ], [ %sub2, %loop ]
; First product: zext(src1[iv]) * sext(src2[iv]) (mixed extension kinds).
%gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv
%load1 = load i8, ptr %gep1
%zext1 = zext i8 %load1 to i32
%gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv
%load2 = load i8, ptr %gep2
%sext2 = sext i8 %load2 to i32
%mul12 = mul i32 %zext1, %sext2
; Second product: zext(src1[iv]) * sext(src3[iv]); it reuses %zext1.
%gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv
%load3 = load i8, ptr %gep3
%sext3 = sext i8 %load3 to i32
%mul13 = mul i32 %zext1, %sext3
; Reduction chain: add the first product, then subtract the second.
%add1 = add i32 %acc, %mul12
%sub2 = sub i32 %add1, %mul13
; Continue while the incremented counter is unsigned-less-than %n.
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !9
exit:
ret i32 %sub2
}
; Loop hints attached to the latch branch above: interleave count 1 and
; vectorize width 16.
!9 = distinct !{!9, !10, !11}
!10 = !{!"llvm.loop.interleave.count", i32 1}
!11 = !{!"llvm.loop.vectorize.width", i32 16}
; sub(i64, zext(i8)->i64 * zext(i8)->i64)
;
; Per iteration:
;   acc = (acc + zext(src1[i]) * zext(src2[i])) - zext(src2[i]) * zext(src3[i])
; with i8 loads widened into an i64 accumulator that starts at %init.
define i64 @sub_reduction_i64_zext_i8_zext_i8(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) {
; For the NEON run line, no partial reductions are generated because the operation is not natively supported (hence no check line for EXPRESSION).
;
; NEON-LABEL: 'sub_reduction_i64_zext_i8_zext_i8'
; SVE-LABEL: 'sub_reduction_i64_zext_i8_zext_i8'
; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SVE2-LABEL: 'sub_reduction_i64_zext_i8_zext_i8'
; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SVE2p1-LABEL: 'sub_reduction_i64_zext_i8_zext_i8'
; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE2p1: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p1: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SVE2p3-LABEL: 'sub_reduction_i64_zext_i8_zext_i8'
; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE2p3: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p3: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SME2-LABEL: 'sub_reduction_i64_zext_i8_zext_i8'
; SME2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SME2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SME2: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SME2: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; I8MM-LABEL: 'sub_reduction_i64_zext_i8_zext_i8'
; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; I8MM: Cost of 1 for VF vscale x 16: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; I8MM: Cost of 3 for VF vscale x 16: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
entry:
br label %loop
loop:
; %iv is the i32 induction variable; %acc carries the running i64 reduction.
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i64 [ %init, %entry ], [ %sub2, %loop ]
; First product: zext(src1[iv]) * zext(src2[iv]).
%gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv
%load1 = load i8, ptr %gep1
%zext1 = zext i8 %load1 to i64
%gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv
%load2 = load i8, ptr %gep2
%zext2 = zext i8 %load2 to i64
%mul12 = mul i64 %zext1, %zext2
; Second product: zext(src2[iv]) * zext(src3[iv]); it reuses %zext2.
%gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv
%load3 = load i8, ptr %gep3
%zext3 = zext i8 %load3 to i64
%mul13 = mul i64 %zext2, %zext3
; Reduction chain: add the first product, then subtract the second.
%add1 = add i64 %acc, %mul12
%sub2 = sub i64 %add1, %mul13
; Continue while the incremented counter is unsigned-less-than %n.
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !12
exit:
ret i64 %sub2
}
; Loop hints attached to the latch branch above: interleave count 1 and
; vectorize width 16.
!12 = distinct !{!12, !13, !14}
!13 = !{!"llvm.loop.interleave.count", i32 1}
!14 = !{!"llvm.loop.vectorize.width", i32 16}
; There is no usdot for i8 -> i64, so a regular reduction is preferred due to
; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION.
;
; sub(i64, zext(i8)->i64 * sext(i8)->i64)
;
; Per iteration:
;   acc = (acc + zext(src1[i]) * sext(src2[i])) - zext(src1[i]) * sext(src3[i])
; with i8 loads widened into an i64 accumulator that starts at %init.
; Every run line has only a label check in this function.
define i64 @sub_reduction_i64_zext_i8_sext_i8(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) {
; NEON-LABEL: 'sub_reduction_i64_zext_i8_sext_i8'
; SVE-LABEL: 'sub_reduction_i64_zext_i8_sext_i8'
; SVE2-LABEL: 'sub_reduction_i64_zext_i8_sext_i8'
; SVE2p1-LABEL: 'sub_reduction_i64_zext_i8_sext_i8'
; SVE2p3-LABEL: 'sub_reduction_i64_zext_i8_sext_i8'
; SME2-LABEL: 'sub_reduction_i64_zext_i8_sext_i8'
; I8MM-LABEL: 'sub_reduction_i64_zext_i8_sext_i8'
entry:
br label %loop
loop:
; %iv is the i32 induction variable; %acc carries the running i64 reduction.
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i64 [ %init, %entry ], [ %sub2, %loop ]
; First product: zext(src1[iv]) * sext(src2[iv]) (mixed extension kinds).
%gep1 = getelementptr inbounds i8, ptr %src1, i32 %iv
%load1 = load i8, ptr %gep1
%zext1 = zext i8 %load1 to i64
%gep2 = getelementptr inbounds i8, ptr %src2, i32 %iv
%load2 = load i8, ptr %gep2
%sext2 = sext i8 %load2 to i64
%mul12 = mul i64 %zext1, %sext2
; Second product: zext(src1[iv]) * sext(src3[iv]); it reuses %zext1.
%gep3 = getelementptr inbounds i8, ptr %src3, i32 %iv
%load3 = load i8, ptr %gep3
%sext3 = sext i8 %load3 to i64
%mul13 = mul i64 %zext1, %sext3
; Reduction chain: add the first product, then subtract the second.
%add1 = add i64 %acc, %mul12
%sub2 = sub i64 %add1, %mul13
; Continue while the incremented counter is unsigned-less-than %n.
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !15
exit:
ret i64 %sub2
}
; Loop hints attached to the latch branch above: interleave count 1 and
; vectorize width 16.
!15 = distinct !{!15, !16, !17}
!16 = !{!"llvm.loop.interleave.count", i32 1}
!17 = !{!"llvm.loop.vectorize.width", i32 16}
; sub(i32, zext(i16)->i32 * zext(i16)->i32)
;
; Per iteration:
;   acc = (acc + zext(src1[i]) * zext(src2[i])) - zext(src2[i]) * zext(src3[i])
; with i16 loads widened into an i32 accumulator that starts at %init.
; For the plain +sve run line only a label check is present below, i.e. no
; EXPRESSION cost lines were recorded for it.
define i32 @sub_reduction_i32_zext_i16_zext_i16(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) {
; NEON-LABEL: 'sub_reduction_i32_zext_i16_zext_i16'
; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
; SVE-LABEL: 'sub_reduction_i32_zext_i16_zext_i16'
; SVE2-LABEL: 'sub_reduction_i32_zext_i16_zext_i16'
; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
; SVE2p1-LABEL: 'sub_reduction_i32_zext_i16_zext_i16'
; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2p1: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2p1: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
; SVE2p3-LABEL: 'sub_reduction_i32_zext_i16_zext_i16'
; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2p3: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SVE2p3: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
; SME2-LABEL: 'sub_reduction_i32_zext_i16_zext_i16'
; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SME2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; SME2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
; I8MM-LABEL: 'sub_reduction_i32_zext_i16_zext_i16'
; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i32), (ir<%load2> zext to i32))
; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i32), (ir<%load3> zext to i32)))
;
entry:
br label %loop
loop:
; %iv is the i32 induction variable; %acc carries the running i32 reduction.
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i32 [ %init, %entry ], [ %sub2, %loop ]
; First product: zext(src1[iv]) * zext(src2[iv]), elements are i16.
%gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv
%load1 = load i16, ptr %gep1
%zext1 = zext i16 %load1 to i32
%gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv
%load2 = load i16, ptr %gep2
%zext2 = zext i16 %load2 to i32
%mul12 = mul i32 %zext1, %zext2
; Second product: zext(src2[iv]) * zext(src3[iv]); it reuses %zext2.
%gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv
%load3 = load i16, ptr %gep3
%zext3 = zext i16 %load3 to i32
%mul13 = mul i32 %zext2, %zext3
; Reduction chain: add the first product, then subtract the second.
%add1 = add i32 %acc, %mul12
%sub2 = sub i32 %add1, %mul13
; Continue while the incremented counter is unsigned-less-than %n.
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !18
exit:
ret i32 %sub2
}
; Loop hints attached to the latch branch above: interleave count 1 and
; vectorize width 8.
!18 = distinct !{!18, !19, !20}
!19 = !{!"llvm.loop.interleave.count", i32 1}
!20 = !{!"llvm.loop.vectorize.width", i32 8}
; There is no usdot for i16 -> i32, so a regular reduction is preferred due to
; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION.
;
; sub(i32, zext(i16)->i32 * sext(i16)->i32)
;
; Per iteration:
;   acc = (acc + zext(src1[i]) * sext(src2[i])) - zext(src1[i]) * sext(src3[i])
; with i16 loads widened into an i32 accumulator that starts at %init.
; Every run line has only a label check in this function.
define i32 @sub_reduction_i32_zext_i16_sext_i16(ptr %src1, ptr %src2, ptr %src3, i32 %init, i32 %n) {
; NEON-LABEL: 'sub_reduction_i32_zext_i16_sext_i16'
; SVE-LABEL: 'sub_reduction_i32_zext_i16_sext_i16'
; SVE2-LABEL: 'sub_reduction_i32_zext_i16_sext_i16'
; SVE2p1-LABEL: 'sub_reduction_i32_zext_i16_sext_i16'
; SVE2p3-LABEL: 'sub_reduction_i32_zext_i16_sext_i16'
; SME2-LABEL: 'sub_reduction_i32_zext_i16_sext_i16'
; I8MM-LABEL: 'sub_reduction_i32_zext_i16_sext_i16'
entry:
br label %loop
loop:
; %iv is the i32 induction variable; %acc carries the running i32 reduction.
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i32 [ %init, %entry ], [ %sub2, %loop ]
; First product: zext(src1[iv]) * sext(src2[iv]) (mixed extension kinds).
%gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv
%load1 = load i16, ptr %gep1
%zext1 = zext i16 %load1 to i32
%gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv
%load2 = load i16, ptr %gep2
%sext2 = sext i16 %load2 to i32
%mul12 = mul i32 %zext1, %sext2
; Second product: zext(src1[iv]) * sext(src3[iv]); it reuses %zext1.
%gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv
%load3 = load i16, ptr %gep3
%sext3 = sext i16 %load3 to i32
%mul13 = mul i32 %zext1, %sext3
; Reduction chain: add the first product, then subtract the second.
%add1 = add i32 %acc, %mul12
%sub2 = sub i32 %add1, %mul13
; Continue while the incremented counter is unsigned-less-than %n.
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !21
exit:
ret i32 %sub2
}
; Loop hints attached to the latch branch above: interleave count 1 and
; vectorize width 8.
!21 = distinct !{!21, !22, !23}
!22 = !{!"llvm.loop.interleave.count", i32 1}
!23 = !{!"llvm.loop.vectorize.width", i32 8}
; sub(i64, zext(i16)->i64 * zext(i16)->i64)
define i64 @sub_reduction_i64_zext_i16_zext_i16(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) {
; For the NEON run line, no partial reductions are generated because the operation is not natively supported (hence no check line for EXPRESSION).
;
; NEON-LABEL: 'sub_reduction_i64_zext_i16_zext_i16'
; SVE-LABEL: 'sub_reduction_i64_zext_i16_zext_i16'
; SVE: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SVE2-LABEL: 'sub_reduction_i64_zext_i16_zext_i16'
; SVE2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SVE2p1-LABEL: 'sub_reduction_i64_zext_i16_zext_i16'
; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p1: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p1: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SVE2p3-LABEL: 'sub_reduction_i64_zext_i16_zext_i16'
; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p3: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p3: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SME2-LABEL: 'sub_reduction_i64_zext_i16_zext_i16'
; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SME2: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SME2: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; I8MM-LABEL: 'sub_reduction_i64_zext_i16_zext_i16'
; I8MM: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; I8MM: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; I8MM: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; I8MM: Cost of 3 for VF vscale x 8: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
entry:
br label %loop
loop:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i64 [ %init, %entry ], [ %sub2, %loop ]
%gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv
%load1 = load i16, ptr %gep1
%zext1 = zext i16 %load1 to i64
%gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv
%load2 = load i16, ptr %gep2
%zext2 = zext i16 %load2 to i64
%mul12 = mul i64 %zext1, %zext2
%gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv
%load3 = load i16, ptr %gep3
%zext3 = zext i16 %load3 to i64
%mul13 = mul i64 %zext2, %zext3
%add1 = add i64 %acc, %mul12
%sub2 = sub i64 %add1, %mul13
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !24
exit:
ret i64 %sub2
}
; Loop hints attached to the loop of @sub_reduction_i64_zext_i16_zext_i16 above
; (its back-edge branch references !24): interleave count 1 and a requested
; vectorization width of 8.
!24 = distinct !{!24, !25, !26}
!25 = !{!"llvm.loop.interleave.count", i32 1}
!26 = !{!"llvm.loop.vectorize.width", i32 8}
; sub(i64, zext(i16)->i64 * sext(i16)->i64)
;
; Each iteration computes
;   acc + zext(load1) * sext(load2) - zext(load1) * sext(load3)
; so both products mix a zero-extended and a sign-extended i16 operand.
;
; The assertions for this function consist only of the per-configuration LABEL
; lines: no "Cost of ... EXPRESSION" line is checked for any of the
; configurations.
; NOTE(review): presumably the mixed-sign i16 -> i64 case is not turned into a
; partial.reduce.add EXPRESSION for the same reason given for the i32 variant
; further down (no suitable mixed-sign dot product) -- TODO confirm.
define i64 @sub_reduction_i64_zext_i16_sext_i16(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) {
; NEON-LABEL: 'sub_reduction_i64_zext_i16_sext_i16'
; SVE-LABEL: 'sub_reduction_i64_zext_i16_sext_i16'
; SVE2-LABEL: 'sub_reduction_i64_zext_i16_sext_i16'
; SVE2p1-LABEL: 'sub_reduction_i64_zext_i16_sext_i16'
; SVE2p3-LABEL: 'sub_reduction_i64_zext_i16_sext_i16'
; SME2-LABEL: 'sub_reduction_i64_zext_i16_sext_i16'
; I8MM-LABEL: 'sub_reduction_i64_zext_i16_sext_i16'
entry:
br label %loop
loop:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i64 [ %init, %entry ], [ %sub2, %loop ]
%gep1 = getelementptr inbounds i16, ptr %src1, i32 %iv
%load1 = load i16, ptr %gep1
%zext1 = zext i16 %load1 to i64
%gep2 = getelementptr inbounds i16, ptr %src2, i32 %iv
%load2 = load i16, ptr %gep2
%sext2 = sext i16 %load2 to i64
%mul12 = mul i64 %zext1, %sext2
%gep3 = getelementptr inbounds i16, ptr %src3, i32 %iv
%load3 = load i16, ptr %gep3
%sext3 = sext i16 %load3 to i64
; The second product reuses the zero-extended %load1 as its first operand.
%mul13 = mul i64 %zext1, %sext3
; Reduction chain: one add link followed by one sub link.
%add1 = add i64 %acc, %mul12
%sub2 = sub i64 %add1, %mul13
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !27
exit:
ret i64 %sub2
}
; Loop hints for the loop above: interleave count 1 and a requested
; vectorization width of 8.
!27 = distinct !{!27, !28, !29}
!28 = !{!"llvm.loop.interleave.count", i32 1}
!29 = !{!"llvm.loop.vectorize.width", i32 8}
; sub(i64, zext(i32)->i64 * zext(i32)->i64)
;
; Each iteration computes
;   acc + zext(load1) * zext(load2) - zext(load2) * zext(load3)
; with all three i32 loads zero-extended to i64.
;
; In the checked output the sub link is printed as a partial.reduce.add of
; "sub (0, mul ...)", i.e. as an add of the negated product. The plain +sve
; configuration has only a LABEL line, so no EXPRESSION cost is checked for it.
define i64 @sub_reduction_i64_zext_i32_zext_i32(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) {
; NEON-LABEL: 'sub_reduction_i64_zext_i32_zext_i32'
; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SVE-LABEL: 'sub_reduction_i64_zext_i32_zext_i32'
; SVE2-LABEL: 'sub_reduction_i64_zext_i32_zext_i32'
; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SVE2p1-LABEL: 'sub_reduction_i64_zext_i32_zext_i32'
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SVE2p3-LABEL: 'sub_reduction_i64_zext_i32_zext_i32'
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; SME2-LABEL: 'sub_reduction_i64_zext_i32_zext_i32'
; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
; I8MM-LABEL: 'sub_reduction_i64_zext_i32_zext_i32'
; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.add (mul (ir<%load1> zext to i64), (ir<%load2> zext to i64))
; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.add (sub (0, mul (ir<%load2> zext to i64), (ir<%load3> zext to i64)))
;
entry:
br label %loop
loop:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i64 [ %init, %entry ], [ %sub2, %loop ]
%gep1 = getelementptr inbounds i32, ptr %src1, i32 %iv
%load1 = load i32, ptr %gep1
%zext1 = zext i32 %load1 to i64
%gep2 = getelementptr inbounds i32, ptr %src2, i32 %iv
%load2 = load i32, ptr %gep2
%zext2 = zext i32 %load2 to i64
%mul12 = mul i64 %zext1, %zext2
%gep3 = getelementptr inbounds i32, ptr %src3, i32 %iv
%load3 = load i32, ptr %gep3
%zext3 = zext i32 %load3 to i64
; The second product shares the zero-extended %load2 with the first one.
%mul13 = mul i64 %zext2, %zext3
; Reduction chain: one add link followed by one sub link.
%add1 = add i64 %acc, %mul12
%sub2 = sub i64 %add1, %mul13
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !30
exit:
ret i64 %sub2
}
; Loop hints for the loop above: interleave count 1 and a requested
; vectorization width of 4 (matching the VF 4 / vscale x 4 cost lines checked).
!30 = distinct !{!30, !31, !32}
!31 = !{!"llvm.loop.interleave.count", i32 1}
!32 = !{!"llvm.loop.vectorize.width", i32 4}
; There is no usdot for i32 -> i64, so a regular reduction is preferred due to
; high expansion cost, hence no costs for a partial.reduce.add EXPRESSION.
;
; sub(i64, zext(i32)->i64 * sext(i32)->i64)
;
; Each iteration computes
;   acc + zext(load1) * sext(load2) - zext(load1) * sext(load3)
; Only the per-configuration LABEL lines are asserted for this function.
; The loop metadata !33 referenced by the back-edge branch is defined further
; down in this file, after @reduce_fsub_fadd_chain_without_mul.
define i64 @sub_reduction_i64_zext_i32_sext_i32(ptr %src1, ptr %src2, ptr %src3, i64 %init, i32 %n) {
; NEON-LABEL: 'sub_reduction_i64_zext_i32_sext_i32'
; SVE-LABEL: 'sub_reduction_i64_zext_i32_sext_i32'
; SVE2-LABEL: 'sub_reduction_i64_zext_i32_sext_i32'
; SVE2p1-LABEL: 'sub_reduction_i64_zext_i32_sext_i32'
; SVE2p3-LABEL: 'sub_reduction_i64_zext_i32_sext_i32'
; SME2-LABEL: 'sub_reduction_i64_zext_i32_sext_i32'
; I8MM-LABEL: 'sub_reduction_i64_zext_i32_sext_i32'
entry:
br label %loop
loop:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi i64 [ %init, %entry ], [ %sub2, %loop ]
%gep1 = getelementptr inbounds i32, ptr %src1, i32 %iv
%load1 = load i32, ptr %gep1
%zext1 = zext i32 %load1 to i64
%gep2 = getelementptr inbounds i32, ptr %src2, i32 %iv
%load2 = load i32, ptr %gep2
%sext2 = sext i32 %load2 to i64
%mul12 = mul i64 %zext1, %sext2
%gep3 = getelementptr inbounds i32, ptr %src3, i32 %iv
%load3 = load i32, ptr %gep3
%sext3 = sext i32 %load3 to i64
; The second product reuses the zero-extended %load1 as its first operand.
%mul13 = mul i64 %zext1, %sext3
; Reduction chain: one add link followed by one sub link.
%add1 = add i64 %acc, %mul12
%sub2 = sub i64 %add1, %mul13
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !33
exit:
ret i64 %sub2
}
; Integer sub/add reduction chain whose links are plain extends (no multiply):
;   accum = accum - sext(a[i]) + sext(b[i])      (i16 -> i32)
;
; The loop starts at %iv = 0 and exits once %iv.next equals 1025. The back-edge
; branch carries no !llvm.loop metadata, so this function does not request a
; specific vectorization width or interleave count.
;
; In the checked output the sub link is printed as a partial.reduce.add of
; "sub (0, ir<%load.a>) sext to i32". The NEON and plain +sve configurations
; have only a LABEL line, so no EXPRESSION cost is checked for them.
define i32 @reduce_sub_add_chain_without_mul(ptr %a, ptr noalias %b) {
; NEON-LABEL: 'reduce_sub_add_chain_without_mul'
; SVE-LABEL: 'reduce_sub_add_chain_without_mul'
; SVE2-LABEL: 'reduce_sub_add_chain_without_mul'
; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32)
; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32)
;
; SVE2p1-LABEL: 'reduce_sub_add_chain_without_mul'
; SVE2p1: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32)
; SVE2p1: Cost of 1 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32)
; SVE2p1: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32)
; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32)
;
; SVE2p3-LABEL: 'reduce_sub_add_chain_without_mul'
; SVE2p3: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32)
; SVE2p3: Cost of 1 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32)
; SVE2p3: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32)
; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32)
;
; SME2-LABEL: 'reduce_sub_add_chain_without_mul'
; SME2: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32)
; SME2: Cost of 1 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32)
; SME2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32)
; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32)
;
; I8MM-LABEL: 'reduce_sub_add_chain_without_mul'
; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.add (sub (0, ir<%load.a>) sext to i32)
; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.add (ir<%load.b> sext to i32)
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
%gep.a = getelementptr i16, ptr %a, i64 %iv
%load.a = load i16, ptr %gep.a, align 1
%ext.a = sext i16 %load.a to i32
%gep.b = getelementptr i16, ptr %b, i64 %iv
%load.b = load i16, ptr %gep.b, align 1
%ext.b = sext i16 %load.b to i32
; Reduction chain: the sub link comes first, then the add link.
%sub = sub i32 %accum, %ext.a
%add = add i32 %sub, %ext.b
%iv.next = add i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1025
br i1 %exitcond.not, label %for.exit, label %for.body
for.exit:
ret i32 %add
}
; Floating-point counterpart of @reduce_sub_add_chain_without_mul:
;   accum = accum - fpext(a[i]) + fpext(b[i])    (half -> float)
;
; Both the fsub and the fadd carry the "reassoc contract" fast-math flags, and
; the accumulator starts at -0.0. The loop starts at %iv = 0 and exits once
; %iv.next equals 1025; the back-edge branch carries no !llvm.loop metadata, so
; no specific vectorization width or interleave count is requested.
;
; In the checked output the fsub link is printed as a partial.reduce.fadd of
; "fneg(ir<%load.a>)". The plain +sve configuration has only a LABEL line, so
; no EXPRESSION cost is checked for it.
define float @reduce_fsub_fadd_chain_without_mul(ptr %a, ptr noalias %b) {
; NEON-LABEL: 'reduce_fsub_fadd_chain_without_mul'
; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; NEON: Cost of 2 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
;
; SVE-LABEL: 'reduce_fsub_fadd_chain_without_mul'
; SVE2-LABEL: 'reduce_fsub_fadd_chain_without_mul'
; SVE2: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SVE2: Cost of 2 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SVE2: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
;
; SVE2p1-LABEL: 'reduce_fsub_fadd_chain_without_mul'
; SVE2p1: Cost of 2 for VF 4: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SVE2p1: Cost of 1 for VF 4: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
; SVE2p1: Cost of 2 for VF 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SVE2p1: Cost of 1 for VF 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SVE2p1: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
; SVE2p1: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SVE2p1: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
;
; SVE2p3-LABEL: 'reduce_fsub_fadd_chain_without_mul'
; SVE2p3: Cost of 2 for VF 4: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SVE2p3: Cost of 1 for VF 4: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
; SVE2p3: Cost of 2 for VF 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SVE2p3: Cost of 1 for VF 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SVE2p3: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
; SVE2p3: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SVE2p3: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
;
; SME2-LABEL: 'reduce_fsub_fadd_chain_without_mul'
; SME2: Cost of 4 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SME2: Cost of 1 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
; SME2: Cost of 4 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; SME2: Cost of 1 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
;
; I8MM-LABEL: 'reduce_fsub_fadd_chain_without_mul'
; I8MM: Cost of 2 for VF 8: EXPRESSION vp<[[VP8:%[0-9]+]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; I8MM: Cost of 2 for VF 8: EXPRESSION vp<[[VP9:%[0-9]+]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP8]]> = ir<%accum> + partial.reduce.fadd (fneg(ir<%load.a>) reassoc contract fpext to float)
; I8MM: Cost of 2 for VF vscale x 8: EXPRESSION vp<[[VP9]]> = vp<[[VP8]]> + partial.reduce.fadd (ir<%load.b> reassoc contract fpext to float)
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%accum = phi float [ -0.0, %entry ], [ %add, %for.body ]
%gep.a = getelementptr half, ptr %a, i64 %iv
%load.a = load half, ptr %gep.a, align 1
%ext.a = fpext half %load.a to float
%gep.b = getelementptr half, ptr %b, i64 %iv
%load.b = load half, ptr %gep.b, align 1
%ext.b = fpext half %load.b to float
; Reduction chain: the fsub link comes first, then the fadd link.
%sub = fsub reassoc contract float %accum, %ext.a
%add = fadd reassoc contract float %sub, %ext.b
%iv.next = add i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1025
br i1 %exitcond.not, label %for.exit, label %for.body
for.exit:
ret float %add
}
; Loop hints for the loop in @sub_reduction_i64_zext_i32_sext_i32, which is
; defined earlier in this file (its back-edge branch references !33):
; interleave count 1 and a requested vectorization width of 4.
!33 = distinct !{!33, !34, !35}
!34 = !{!"llvm.loop.interleave.count", i32 1}
!35 = !{!"llvm.loop.vectorize.width", i32 4}
; fsub(float, fpext(half)->float * fpext(half)->float)
;
; Each iteration computes
;   acc + fpext(load1) * fpext(load2) - fpext(load2) * fpext(load3)
; The fadd and fsub of the reduction chain are marked "fast"; the two fmul
; instructions carry no fast-math flags.
; The loop metadata !36 referenced by the back-edge branch is defined after
; @sub_reduction_float_fpext_bfloat_fpext_bfloat, further down in this file.
define float @sub_reduction_float_fpext_half_fpext_half(ptr %src1, ptr %src2, ptr %src3, float %init, i32 %n) {
; For the SVE run line, no partial reductions are generated because the operation is not natively supported (hence no check line for EXPRESSION).
;
; NEON-LABEL: 'sub_reduction_float_fpext_half_fpext_half'
; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; SVE-LABEL: 'sub_reduction_float_fpext_half_fpext_half'
; SVE2-LABEL: 'sub_reduction_float_fpext_half_fpext_half'
; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; SVE2p1-LABEL: 'sub_reduction_float_fpext_half_fpext_half'
; SVE2p1: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; SVE2p1: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; SVE2p3-LABEL: 'sub_reduction_float_fpext_half_fpext_half'
; SVE2p3: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; SVE2p3: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; SME2-LABEL: 'sub_reduction_float_fpext_half_fpext_half'
; SME2: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SME2: Cost of 6 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; SME2: Cost of 1 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SME2: Cost of 6 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; I8MM-LABEL: 'sub_reduction_float_fpext_half_fpext_half'
; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
entry:
br label %loop
loop:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi float [ %init, %entry ], [ %sub2, %loop ]
%gep1 = getelementptr inbounds half, ptr %src1, i32 %iv
%load1 = load half, ptr %gep1
%fpext1 = fpext half %load1 to float
%gep2 = getelementptr inbounds half, ptr %src2, i32 %iv
%load2 = load half, ptr %gep2
%fpext2 = fpext half %load2 to float
%mul12 = fmul float %fpext1, %fpext2
%gep3 = getelementptr inbounds half, ptr %src3, i32 %iv
%load3 = load half, ptr %gep3
%fpext3 = fpext half %load3 to float
; The second product shares the extended %load2 with the first one.
%mul13 = fmul float %fpext2, %fpext3
; Reduction chain: one fadd link followed by one fsub link.
%add1 = fadd fast float %acc, %mul12
%sub2 = fsub fast float %add1, %mul13
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !36
exit:
ret float %sub2
}
; fsub(float, fpext(bfloat)->float * fpext(bfloat)->float)
;
; Each iteration computes
;   acc + fpext(load1) * fpext(load2) - fpext(load2) * fpext(load3)
; The function adds "+bf16" through its "target-features" attribute, on top of
; the features selected on each command line. The fadd and fsub of the
; reduction chain are marked "fast"; the two fmul instructions carry no
; fast-math flags. Unlike the half variant, every configuration (including
; plain +sve) has EXPRESSION cost lines checked here.
define float @sub_reduction_float_fpext_bfloat_fpext_bfloat(ptr %src1, ptr %src2, ptr %src3, float %init, i32 %n) "target-features"="+bf16" {
; NEON-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat'
; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; NEON: Cost of 3 for VF 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; NEON: Cost of 2 for VF 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; NEON: Cost of 3 for VF 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; SVE-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat'
; SVE: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; SVE: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; SVE2-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat'
; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; SVE2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; SVE2p1-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat'
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2p1: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; SVE2p3-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat'
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SVE2p3: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; SME2-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat'
; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; SME2: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
; I8MM-LABEL: 'sub_reduction_float_fpext_bfloat_fpext_bfloat'
; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10:%[0-9]+]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; I8MM: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11:%[0-9]+]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
; I8MM: Cost of 2 for VF vscale x 4: EXPRESSION vp<[[VP10]]> = ir<%acc> + partial.reduce.fadd (mul (ir<%load1> fpext to float), (ir<%load2> fpext to float))
; I8MM: Cost of 3 for VF vscale x 4: EXPRESSION vp<[[VP11]]> = vp<[[VP10]]> + partial.reduce.fadd (sub (0, mul (ir<%load2> fpext to float), (ir<%load3> fpext to float)))
;
entry:
br label %loop
loop:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
%acc = phi float [ %init, %entry ], [ %sub2, %loop ]
%gep1 = getelementptr inbounds bfloat, ptr %src1, i32 %iv
%load1 = load bfloat, ptr %gep1
%fpext1 = fpext bfloat %load1 to float
%gep2 = getelementptr inbounds bfloat, ptr %src2, i32 %iv
%load2 = load bfloat, ptr %gep2
%fpext2 = fpext bfloat %load2 to float
%mul12 = fmul float %fpext1, %fpext2
%gep3 = getelementptr inbounds bfloat, ptr %src3, i32 %iv
%load3 = load bfloat, ptr %gep3
%fpext3 = fpext bfloat %load3 to float
; The second product shares the extended %load2 with the first one.
%mul13 = fmul float %fpext2, %fpext3
; Reduction chain: one fadd link followed by one fsub link.
%add1 = fadd fast float %acc, %mul12
%sub2 = fsub fast float %add1, %mul13
%iv.next = add i32 %iv, 1
%cmp = icmp ult i32 %iv.next, %n
br i1 %cmp, label %loop, label %exit, !llvm.loop !36
exit:
ret float %sub2
}
; Loop hints: interleave count 1 and a requested vectorization width of 4.
; !36 is referenced by two loops in this file: the one directly above and the
; one in @sub_reduction_float_fpext_half_fpext_half.
!36 = distinct !{!36, !37, !38}
!37 = !{!"llvm.loop.interleave.count", i32 1}
!38 = !{!"llvm.loop.vectorize.width", i32 4}