[AArch64] Fix condition for "high-vector" DUP optimizations.
AArch64 NEON has a bunch of instructions with a "2" suffix that extract
the top half of the source vectors, instead of the bottom half. We have
some DAGCombines to try to take advantage of that. However, they
assumed that any EXTRACT_VECTOR was extracting the high half of the
vector in question.
This issue has apparently existed since the AArch64 backend was merged.
Fixes https://bugs.llvm.org/show_bug.cgi?id=40632 .
Differential Revision: https://reviews.llvm.org/D57862
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@353486 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 72a3549..cfca92d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9722,12 +9722,13 @@
DAG.getConstant(NumElems, dl, MVT::i64));
}
-static bool isEssentiallyExtractSubvector(SDValue N) {
- if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
- return true;
-
- return N.getOpcode() == ISD::BITCAST &&
- N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
+static bool isEssentiallyExtractHighSubvector(SDValue N) {
+ if (N.getOpcode() == ISD::BITCAST)
+ N = N.getOperand(0);
+ if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+ return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
+ N.getOperand(0).getValueType().getVectorNumElements() / 2;
}
/// Helper structure to keep track of ISD::SET_CC operands.
@@ -9894,13 +9895,13 @@
// It's not worth doing if at least one of the inputs isn't already an
// extract, but we don't know which it'll be so we have to try both.
- if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+ if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
if (!RHS.getNode())
return SDValue();
RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
- } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+ } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
if (!LHS.getNode())
return SDValue();
@@ -9933,11 +9934,11 @@
// Either node could be a DUP, but it's not worth doing both of them (you'd
// just as well use the non-high version) so look for a corresponding extract
// operation on the other "wing".
- if (isEssentiallyExtractSubvector(LHS)) {
+ if (isEssentiallyExtractHighSubvector(LHS)) {
RHS = tryExtendDUPToExtractHigh(RHS, DAG);
if (!RHS.getNode())
return SDValue();
- } else if (isEssentiallyExtractSubvector(RHS)) {
+ } else if (isEssentiallyExtractHighSubvector(RHS)) {
LHS = tryExtendDUPToExtractHigh(LHS, DAG);
if (!LHS.getNode())
return SDValue();
diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
index 53669a1..c6af680 100644
--- a/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@@ -885,6 +885,20 @@
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
+; CHECK: uabdl.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res1 = zext <2 x i32> %res to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uabdl2_from_extract_dup:
+; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -899,6 +913,20 @@
define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
+; CHECK: sabdl.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res1 = zext <2 x i32> %res to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: sabdl2_from_extract_dup:
+; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
%rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
diff --git a/test/CodeGen/AArch64/arm64-vadd.ll b/test/CodeGen/AArch64/arm64-vadd.ll
index 2a25538..a244a07 100644
--- a/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/test/CodeGen/AArch64/arm64-vadd.ll
@@ -738,6 +738,22 @@
declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uaddl_duprhs
+; CHECK-NOT: ext.16b
+; CHECK: uaddl.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+ %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+ %res = add <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uaddl2_duprhs
; CHECK-NOT: ext.16b
@@ -754,6 +770,22 @@
ret <2 x i64> %res
}
+define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: saddl_duplhs
+; CHECK-NOT: ext.16b
+; CHECK: saddl.2d
+ %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+ %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+ %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+ %res = add <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl2_duplhs
; CHECK-NOT: ext.16b
@@ -770,6 +802,22 @@
ret <2 x i64> %res
}
+define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: usubl_duprhs
+; CHECK-NOT: ext.16b
+; CHECK: usubl.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+ %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+ %res = sub <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl2_duprhs
; CHECK-NOT: ext.16b
@@ -786,8 +834,24 @@
ret <2 x i64> %res
}
+define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: ssubl_duplhs:
+; CHECK-NOT: ext.16b
+; CHECK: ssubl.2d
+ %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+ %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+ %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+ %res = sub <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: ssubl2_duplhs
+; CHECK-LABEL: ssubl2_duplhs:
; CHECK-NOT: ext.16b
; CHECK: ssubl2.2d
%lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
diff --git a/test/CodeGen/AArch64/arm64-vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll
index f70ed9a..6d795db 100644
--- a/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1338,6 +1338,19 @@
ret <4 x i32> %vmull2.i
}
+define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo6a:
+; CHECK-NEXT: smull.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo7:
; CHECK-NEXT: smull2.2d v0, v1, v2[1]
@@ -1351,6 +1364,20 @@
ret <2 x i64> %vmull2.i
}
+define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo7a:
+; CHECK-NEXT: smull.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
+
define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo8:
; CHECK-NEXT: umull2.4s v0, v1, v2[1]
@@ -1364,6 +1391,19 @@
ret <4 x i32> %vmull2.i
}
+define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo8a:
+; CHECK-NEXT: umull.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo9:
; CHECK-NEXT: umull2.2d v0, v1, v2[1]
@@ -1377,6 +1417,19 @@
ret <2 x i64> %vmull2.i
}
+define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo9a:
+; CHECK-NEXT: umull.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
; CHECK-LABEL: bar0:
; CHECK: smlal2.8h v0, v1, v2
@@ -1667,6 +1720,24 @@
ret <2 x i64> %vmull2.i
}
+define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK: vmull_low_n_s16_test
+; CHECK-NOT: ext
+; CHECK: smull.4s
+; CHECK-NEXT: ret
+ %conv = trunc i32 %d to i16
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
entry:
; CHECK: vmull_high_n_s16_test
@@ -1804,8 +1875,21 @@
ret <2 x i64> %sum
}
-define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: mull_from_extract_dup:
+define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: mull_from_extract_dup_low:
+; CHECK-NOT: ext
+; CHECK: sqdmull.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: mull_from_extract_dup_high:
; CHECK-NOT: ext
; CHECK: sqdmull2.2d
%rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
@@ -1817,8 +1901,21 @@
ret <2 x i64> %res
}
-define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
-; CHECK-LABEL: pmull_from_extract_dup:
+define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: pmull_from_extract_dup_low:
+; CHECK-NOT: ext
+; CHECK: pmull.8h
+ %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
+ %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: pmull_from_extract_dup_high:
; CHECK-NOT: ext
; CHECK: pmull2.8h
%rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
@@ -1830,8 +1927,20 @@
ret <8 x i16> %res
}
-define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK-LABEL: pmull_from_extract_duplane:
+define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: pmull_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: pmull.8h
+
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: pmull_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: pmull2.8h
@@ -1842,8 +1951,20 @@
ret <8 x i16> %res
}
-define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: sqdmull_from_extract_duplane:
+define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmull_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: sqdmull.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmull_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: sqdmull2.2d
@@ -1854,8 +1975,21 @@
ret <2 x i64> %res
}
-define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: sqdmlal_from_extract_duplane:
+define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: sqdmlal.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: sqdmlal2.2d
@@ -1867,8 +2001,21 @@
ret <2 x i64> %sum
}
-define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: umlal_from_extract_duplane:
+define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: umlal_from_extract_duplane_low:
+; CHECK-NOT: ext
+; CHECK: umlal.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = add <2 x i64> %accum, %res
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: umlal_from_extract_duplane_high:
; CHECK-NOT: ext
; CHECK: umlal2.2d