[AArch64] Optimize extending loads of small vectors
Reduces the total number of loads and the number of moves between SIMD
registers and general-purpose registers.
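
For example, loading a <2 x i16> (test1 in aarch64-load-ext.ll) previously
emitted two lane loads plus address arithmetic:

  ld1 { v0.h }[0], [x0]
  add x8, x0, #2
  ld1 { v0.h }[2], [x8]

and now becomes a single scalar load plus one vector extend:

  ldr s0, [x0]
  ushll v0.4s, v0.4h, #0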
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 662d84b..ac5d614 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23239,6 +23239,99 @@
return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
}
+// Helper function to optimize small vector load + extension patterns.
+// These patterns would otherwise be scalarized into inefficient sequences.
+static SDValue performSmallVectorLoadExtCombine(SDNode *N, SelectionDAG &DAG) {
+ // Don't optimize if NEON is not available. Without NEON, the backend
+ // will need to scalarize these operations anyway.
+ const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ if (!Subtarget.isNeonAvailable())
+ return SDValue();
+ // Don't optimize if SVE is being used for fixed-length vectors, because it
+ // has native support for these patterns.
+ if (Subtarget.useSVEForFixedLengthVectors())
+ return SDValue();
+
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != ISD::ZERO_EXTEND && Opcode != ISD::SIGN_EXTEND &&
+ Opcode != ISD::ANY_EXTEND)
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ if (Op.getOpcode() != ISD::LOAD)
+ return SDValue();
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD || !LD->hasOneUse() ||
+ LD->isVolatile())
+ return SDValue();
+
+ EVT MemVT = LD->getMemoryVT();
+ EVT ResVT = N->getValueType(0);
+ // Check if this is a small vector pattern we want to optimize.
+ if (MemVT != MVT::v2i8 && MemVT != MVT::v2i16)
+ return SDValue();
+
+ unsigned NumElts = MemVT.getVectorNumElements();
+ unsigned SrcEltBits = MemVT.getScalarSizeInBits();
+ unsigned DstEltBits = ResVT.getScalarSizeInBits();
+ unsigned LoadBits = NumElts * SrcEltBits;
+
+  // Check alignment: this combine emits a single wider scalar load, which may
+  // be under-aligned, unlike the per-element loads the original load would
+  // otherwise be legalized into.
+  Align Alignment = LD->getAlign();
+  if (Subtarget.requiresStrictAlign() && Alignment.value() * 8 < LoadBits)
+ return SDValue();
+
+ // The transformation strategy:
+ // 1. Load the memory as a large scalar and turn it into a 64-bit vector.
+ // 2. Bitcast to a narrow type (v8i8 or v4i16) that has efficient NEON extend.
+ // 3. Extend using ushll/sshll, extract subvector, repeat as needed.
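+  //
+  // For example (a sketch for a <2 x i8> load sign-extended to <2 x i32>),
+  // the combine builds:
+  //   t0: i16   = load
+  //   t1: v4i16 = scalar_to_vector t0
+  //   t2: v8i8  = bitcast t1
+  //   t3: v8i16 = sign_extend t2       (selects to sshll)
+  //   t4: v8i32 = sign_extend t3       (legalized; only the low half is used)
+  //   t5: v2i32 = extract_subvector t4, 0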
+
+  // For ANY_EXTEND we may use either sign or zero extension; zero extension
+  // is typically cheaper.
+ if (Opcode == ISD::ANY_EXTEND)
+ Opcode = ISD::ZERO_EXTEND;
+
+ SDLoc DL(N);
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ const MachinePointerInfo &PtrInfo = LD->getPointerInfo();
+ MVT LoadTy = MVT::getIntegerVT(LoadBits);
+ SDValue Load = DAG.getLoad(LoadTy, DL, Chain, BasePtr, PtrInfo, Alignment);
+
+ // SCALAR_TO_VECTOR needs to create a 64-bit vector for NEON instructions.
+ // The scalar load is inserted into the lower bits of a 64-bit register.
+ // We determine the appropriate 64-bit vector type based on load size,
+ // then bitcast to v8i8 or v4i16 for efficient ushll/sshll extends.
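+  // For v2i8:  i16 load -> v4i16 scalar_to_vector -> bitcast to v8i8.
+  // For v2i16: i32 load -> v2i32 scalar_to_vector -> bitcast to v4i16.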
+ MVT ScalarVecVT = MVT::getVectorVT(LoadTy, 64 / LoadBits);
+ MVT NarrowVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
+ 64 / MemVT.getScalarSizeInBits());
+
+ SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarVecVT, Load);
+ Vec = DAG.getNode(ISD::BITCAST, DL, NarrowVT, Vec);
+ // Extend iteratively: each extend doubles the element size.
+ // We extend the full 64-bit vector to leverage NEON ushll/sshll instructions.
+ while (Vec.getScalarValueSizeInBits() < DstEltBits) {
+ MVT CurVT = Vec.getSimpleValueType();
+ unsigned NextBits = CurVT.getScalarSizeInBits() * 2;
+ MVT WideVT = MVT::getVectorVT(MVT::getIntegerVT(NextBits),
+ CurVT.getVectorNumElements());
+ Vec = DAG.getNode(Opcode, DL, WideVT, Vec);
+
+    // Once the destination element width is reached, drop the excess lanes
+    // introduced by widening to a full 64-bit vector, provided the extended
+    // vector is still at least 64 bits wide.
+ bool HasExcess = WideVT.getVectorNumElements() > NumElts;
+ bool StaysWide = WideVT.getSizeInBits() >= 64;
+ bool IsDone = NextBits >= DstEltBits;
+ if (HasExcess && StaysWide && IsDone) {
+ MVT ExtractVT = MVT::getVectorVT(WideVT.getScalarType(), NumElts);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ }
+ }
+
+ return DAG.getMergeValues({Vec, Load.getValue(1)}, DL);
+}
+
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -23288,6 +23381,12 @@
NewAnyExtend);
}
+  // Try to optimize small vector load + extension patterns.
+ if (SDValue Result = performSmallVectorLoadExtCombine(N, DAG))
+ return Result;
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
index 317feb5..bc0edc9 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -22,17 +22,16 @@
define <2 x i16> @test1(ptr %v2i16_ptr) {
; CHECK-LE-LABEL: test1:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-LE-NEXT: add x8, x0, #2
-; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test1:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-BE-NEXT: add x8, x0, #2
-; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%v2i16 = load <2 x i16>, ptr %v2i16_ptr
@@ -66,17 +65,18 @@
define <2 x i8> @test3(ptr %v2i8_ptr) {
; CHECK-LE-LABEL: test3:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-LE-NEXT: add x8, x0, #1
-; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test3:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-BE-NEXT: add x8, x0, #1
-; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%v2i8 = load <2 x i8>, ptr %v2i8_ptr
@@ -105,19 +105,18 @@
define <2 x i32> @fsext_v2i32(ptr %a) {
; CHECK-LE-LABEL: fsext_v2i32:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ldrsb w8, [x0]
-; CHECK-LE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-LE-NEXT: fmov s0, w8
-; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: fsext_v2i32:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ldrsb w8, [x0]
-; CHECK-BE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-BE-NEXT: fmov s0, w8
-; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%x = load <2 x i8>, ptr %a
@@ -249,19 +248,18 @@
define <2 x i16> @fsext_v2i16(ptr %a) {
; CHECK-LE-LABEL: fsext_v2i16:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ldrsb w8, [x0]
-; CHECK-LE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-LE-NEXT: fmov s0, w8
-; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: fsext_v2i16:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ldrsb w8, [x0]
-; CHECK-BE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-BE-NEXT: fmov s0, w8
-; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%x = load <2 x i8>, ptr %a
@@ -497,3 +495,219 @@
%v4i8 = load <4 x i8>, ptr %v4i8_ptr, align 1
ret <4 x i8> %v4i8
}
+
+define <2 x i16> @zext_v2i8_v2i16(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i16:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i16:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i16>
+ ret <2 x i16> %y
+}
+
+define <2 x i32> @zext_v2i8_v2i32(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @zext_v2i8_v2i64(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i32> @zext_v2i16_v2i32(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i16_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i16_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = zext <2 x i16> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @zext_v2i16_v2i64(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i16_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i16_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = zext <2 x i16> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i16> @sext_v2i8_v2i16(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i8_v2i16:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i8_v2i16:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = sext <2 x i8> %x to <2 x i16>
+ ret <2 x i16> %y
+}
+
+define <2 x i32> @sext_v2i8_v2i32(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i8_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i8_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = sext <2 x i8> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @sext_v2i8_v2i64(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i8_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i8_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = sext <2 x i8> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i32> @sext_v2i16_v2i32(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i16_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i16_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = sext <2 x i16> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @sext_v2i16_v2i64(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i16_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i16_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = sext <2 x i16> %x to <2 x i64>
+ ret <2 x i64> %y
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 6e5c666..2cd54d4 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,23 +222,17 @@
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0]
-; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT: ldr s0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: fmov d0, x8
-; CHECK-NEON-NEXT: mov v0.d[1], x9
-; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0]
-; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT: ldr s0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: fmov d0, x8
-; CHECK-SVE-NEXT: mov v0.d[1], x9
-; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
+; CHECK-SVE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index cabb0e7..d646cfe 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -263,16 +263,16 @@
define <2 x i16> @std_v2i8_v2i16(ptr %p) {
; CHECK-LABEL: std_v2i8_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrb w9, [x0, #3]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: ldrb w9, [x0, #1]
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: shl v0.2s, v0.2s, #3
-; CHECK-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: ldr h0, [x0, #2]
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: mov h2, v0.h[0]
+; CHECK-NEXT: mov h3, v1.h[0]
+; CHECK-NEXT: mov v2.h[2], v0.h[1]
+; CHECK-NEXT: mov v3.h[2], v1.h[1]
+; CHECK-NEXT: shl v0.2s, v2.2s, #3
+; CHECK-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NEXT: ret
%l1 = load <2 x i8>, ptr %p
%q = getelementptr i8, ptr %p, i32 2
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index c4bb6e3..b138fa4 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -230,9 +230,9 @@
define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b) {
; CHECK-SD-LABEL: load_v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: ret
;
@@ -269,9 +269,8 @@
define <2 x i16> @load_v2i16(ptr %ptr) {
; CHECK-SD-LABEL: load_v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: ret
;