[AArch64] Use SVE to materialise some 128-bit vector constants (#159101)

There is no easy way to materialise some fixed-width vector constants
with 64-bit elements. This is because, for 64-bit elements, NEON's movi
instruction can only set all bits within each byte to the same value,
e.g. 0xFF can be encoded as an immediate but not 0x1F. However, if SVE
is available we can use its dup instruction to cover more cases.

Rather than lower the immediate directly using the dup instruction, I've
instead used the generic SPLAT_VECTOR node in combination with an
EXTRACT_SUBVECTOR. This is because we already have SVE splat_vector
patterns that can match directly to dup.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 54bdb87..6a1b06e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -4346,34 +4346,14 @@
                     ->getAPIntValue()
                     .trunc(VT.getFixedSizeInBits())
                     .getSExtValue();
+  int32_t ImmVal, ShiftVal;
+  if (!AArch64_AM::isSVECpyDupImm(VT.getScalarSizeInBits(), Val, ImmVal,
+                                  ShiftVal))
+    return false;
 
-  switch (VT.SimpleTy) {
-  case MVT::i8:
-    // All immediates are supported.
-    Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
-    Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
-    return true;
-  case MVT::i16:
-  case MVT::i32:
-  case MVT::i64:
-    // Support 8bit signed immediates.
-    if (Val >= -128 && Val <= 127) {
-      Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
-      return true;
-    }
-    // Support 16bit signed immediates that are a multiple of 256.
-    if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) {
-      Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
-      Imm = CurDAG->getTargetConstant((Val >> 8) & 0xFF, DL, MVT::i32);
-      return true;
-    }
-    break;
-  default:
-    break;
-  }
-
-  return false;
+  Shift = CurDAG->getTargetConstant(ShiftVal, DL, MVT::i32);
+  Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+  return true;
 }
 
 bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 50f9da9..a4c1e26 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15270,6 +15270,27 @@
   return DAG.getBuildVector(VT, DL, Ops);
 }
 
+static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
+                             const AArch64Subtarget *ST, APInt &DefBits) {
+  EVT VT = Op.getValueType();
+  // TODO: We should be able to support 64-bit destinations too
+  if (!ST->hasSVE() || !VT.is128BitVector() ||
+      DefBits.getHiBits(64) != DefBits.getLoBits(64))
+    return SDValue();
+
+  // See if we can make use of the SVE dup instruction.
+  APInt Val64 = DefBits.trunc(64);
+  int32_t ImmVal, ShiftVal;
+  if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal))
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
+                                        DAG.getConstant(Val64, DL, MVT::i64));
+  SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
+  return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
+}
+
 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
                                    const AArch64Subtarget *ST) {
   EVT VT = Op.getValueType();
@@ -15309,6 +15330,10 @@
     if (SDValue R = TryMOVIWithBits(UndefBits))
       return R;
 
+    // Try to materialise the constant using SVE when available.
+    if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
+      return R;
+
     // See if a fneg of the constant can be materialized with a MOVI, etc
     auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
       // FNegate each sub-element of the constant
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index f542592..4ae5d04 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -871,6 +871,36 @@
   return isAnyMOVZMovAlias(Value, RegWidth);
 }
 
+static inline bool isSVECpyDupImm(int SizeInBits, int64_t Val, int32_t &Imm,
+                                  int32_t &Shift) {
+  switch (SizeInBits) {
+  case 8:
+    // All immediates are supported.
+    Shift = 0;
+    Imm = Val & 0xFF;
+    return true;
+  case 16:
+  case 32:
+  case 64:
+    // Support 8bit signed immediates.
+    if (Val >= -128 && Val <= 127) {
+      Shift = 0;
+      Imm = Val & 0xFF;
+      return true;
+    }
+    // Support 16bit signed immediates that are a multiple of 256.
+    if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) {
+      Shift = 8;
+      Imm = (Val >> 8) & 0xFF;
+      return true;
+    }
+    break;
+  default:
+    break;
+  }
+  return false;
+}
+
 } // end namespace AArch64_AM
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
index 832e34b..f5cf629 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -75,10 +75,9 @@
 ; CHECK-LABEL: vector_loop_with_icmp:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    mov w8, #2 // =0x2
-; CHECK-NEXT:    mov w9, #16 // =0x10
-; CHECK-NEXT:    dup v1.2d, x8
+; CHECK-NEXT:    mov z1.d, #2 // =0x2
 ; CHECK-NEXT:    add x8, x0, #4
+; CHECK-NEXT:    mov w9, #16 // =0x10
 ; CHECK-NEXT:    mov w10, #1 // =0x1
 ; CHECK-NEXT:    b .LBB5_2
 ; CHECK-NEXT:  .LBB5_1: // %pred.store.continue6
diff --git a/llvm/test/CodeGen/AArch64/movi64_sve.ll b/llvm/test/CodeGen/AArch64/movi64_sve.ll
new file mode 100644
index 0000000..1d4e00d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/movi64_sve.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=COMMON,NEON
+; RUN: llc -mtriple=aarch64 -mattr=+neon,+sve < %s | FileCheck %s --check-prefixes=COMMON,SVE
+
+define <2 x i64> @movi_1_v2i64() {
+; NEON-LABEL: movi_1_v2i64:
+; NEON:       // %bb.0:
+; NEON-NEXT:    mov w8, #1 // =0x1
+; NEON-NEXT:    dup v0.2d, x8
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_1_v2i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #1 // =0x1
+; SVE-NEXT:    ret
+  ret <2 x i64> splat (i64 1)
+}
+
+define <2 x i64> @movi_127_v2i64() {
+; NEON-LABEL: movi_127_v2i64:
+; NEON:       // %bb.0:
+; NEON-NEXT:    mov w8, #127 // =0x7f
+; NEON-NEXT:    dup v0.2d, x8
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_127_v2i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #127 // =0x7f
+; SVE-NEXT:    ret
+  ret <2 x i64> splat (i64 127)
+}
+
+define <2 x i64> @movi_m128_v2i64() {
+; NEON-LABEL: movi_m128_v2i64:
+; NEON:       // %bb.0:
+; NEON-NEXT:    mov x8, #-128 // =0xffffffffffffff80
+; NEON-NEXT:    dup v0.2d, x8
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_m128_v2i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #-128 // =0xffffffffffffff80
+; SVE-NEXT:    ret
+  ret <2 x i64> splat (i64 -128)
+}
+
+define <2 x i64> @movi_256_v2i64() {
+; NEON-LABEL: movi_256_v2i64:
+; NEON:       // %bb.0:
+; NEON-NEXT:    mov w8, #256 // =0x100
+; NEON-NEXT:    dup v0.2d, x8
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_256_v2i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #256 // =0x100
+; SVE-NEXT:    ret
+  ret <2 x i64> splat (i64 256)
+}
+
+define <2 x i64> @movi_32512_v2i64() {
+; NEON-LABEL: movi_32512_v2i64:
+; NEON:       // %bb.0:
+; NEON-NEXT:    mov w8, #32512 // =0x7f00
+; NEON-NEXT:    dup v0.2d, x8
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_32512_v2i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #32512 // =0x7f00
+; SVE-NEXT:    ret
+  ret <2 x i64> splat (i64 32512)
+}
+
+define <2 x i64> @movi_m32768_v2i64() {
+; NEON-LABEL: movi_m32768_v2i64:
+; NEON:       // %bb.0:
+; NEON-NEXT:    mov x8, #-32768 // =0xffffffffffff8000
+; NEON-NEXT:    dup v0.2d, x8
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_m32768_v2i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #-32768 // =0xffffffffffff8000
+; SVE-NEXT:    ret
+  ret <2 x i64> splat (i64 -32768)
+}
+
+; Special cases where the destination vector does not have 64-bit elements
+
+define <4 x i32> @movi_v4i32_1() {
+; NEON-LABEL: movi_v4i32_1:
+; NEON:       // %bb.0:
+; NEON-NEXT:    adrp x8, .LCPI6_0
+; NEON-NEXT:    ldr q0, [x8, :lo12:.LCPI6_0]
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_v4i32_1:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #127 // =0x7f
+; SVE-NEXT:    ret
+  ret <4 x i32> <i32 127, i32 0, i32 127, i32 0>
+}
+
+define <4 x i32> @movi_v4i32_2() {
+; NEON-LABEL: movi_v4i32_2:
+; NEON:       // %bb.0:
+; NEON-NEXT:    adrp x8, .LCPI7_0
+; NEON-NEXT:    ldr q0, [x8, :lo12:.LCPI7_0]
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_v4i32_2:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #32512 // =0x7f00
+; SVE-NEXT:    ret
+  ret <4 x i32> <i32 32512, i32 0, i32 32512, i32 0>
+}
+
+define <8 x i16> @movi_v8i16_1() {
+; NEON-LABEL: movi_v8i16_1:
+; NEON:       // %bb.0:
+; NEON-NEXT:    adrp x8, .LCPI8_0
+; NEON-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_v8i16_1:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #127 // =0x7f
+; SVE-NEXT:    ret
+  ret <8 x i16> <i16 127, i16 0, i16 0, i16 0, i16 127, i16 0, i16 0, i16 0>
+}
+
+define <8 x i16> @movi_v8i16_2() {
+; NEON-LABEL: movi_v8i16_2:
+; NEON:       // %bb.0:
+; NEON-NEXT:    adrp x8, .LCPI9_0
+; NEON-NEXT:    ldr q0, [x8, :lo12:.LCPI9_0]
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_v8i16_2:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #32512 // =0x7f00
+; SVE-NEXT:    ret
+  ret <8 x i16> <i16 32512, i16 0, i16 0, i16 0, i16 32512, i16 0, i16 0, i16 0>
+}
+
+define <16 x i8> @movi_v16i8_1() {
+; NEON-LABEL: movi_v16i8_1:
+; NEON:       // %bb.0:
+; NEON-NEXT:    adrp x8, .LCPI10_0
+; NEON-NEXT:    ldr q0, [x8, :lo12:.LCPI10_0]
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_v16i8_1:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #127 // =0x7f
+; SVE-NEXT:    ret
+  ret <16 x i8> <i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+define <16 x i8> @movi_v16i8_2() {
+; NEON-LABEL: movi_v16i8_2:
+; NEON:       // %bb.0:
+; NEON-NEXT:    adrp x8, .LCPI11_0
+; NEON-NEXT:    ldr q0, [x8, :lo12:.LCPI11_0]
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: movi_v16i8_2:
+; SVE:       // %bb.0:
+; SVE-NEXT:    mov z0.d, #32512 // =0x7f00
+; SVE-NEXT:    ret
+  ret <16 x i8> <i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+; Negative cases
+
+define <2 x i64> @movi_128_v2i64() {
+; COMMON-LABEL: movi_128_v2i64:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    mov w8, #128 // =0x80
+; COMMON-NEXT:    dup v0.2d, x8
+; COMMON-NEXT:    ret
+  ret <2 x i64> splat (i64 128)
+}
+
+define <2 x i64> @movi_m127_v2i64() {
+; COMMON-LABEL: movi_m127_v2i64:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    mov x8, #-129 // =0xffffffffffffff7f
+; COMMON-NEXT:    dup v0.2d, x8
+; COMMON-NEXT:    ret
+  ret <2 x i64> splat (i64 -129)
+}
+
+define <2 x i64> @movi_32513_v2i64() {
+; COMMON-LABEL: movi_32513_v2i64:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    mov w8, #32513 // =0x7f01
+; COMMON-NEXT:    dup v0.2d, x8
+; COMMON-NEXT:    ret
+  ret <2 x i64> splat (i64 32513)
+}
+
+define <2 x i64> @movi_m32769_v2i64() {
+; COMMON-LABEL: movi_m32769_v2i64:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    mov x8, #-32769 // =0xffffffffffff7fff
+; COMMON-NEXT:    dup v0.2d, x8
+; COMMON-NEXT:    ret
+  ret <2 x i64> splat (i64 -32769)
+}
+
+define <2 x i64> @movi_257_v2i64() {
+; COMMON-LABEL: movi_257_v2i64:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    mov w8, #257 // =0x101
+; COMMON-NEXT:    dup v0.2d, x8
+; COMMON-NEXT:    ret
+  ret <2 x i64> splat (i64 257)
+}
+
+define <4 x i32> @movi_v4i32_3() {
+; COMMON-LABEL: movi_v4i32_3:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    adrp x8, .LCPI17_0
+; COMMON-NEXT:    ldr q0, [x8, :lo12:.LCPI17_0]
+; COMMON-NEXT:    ret
+  ret <4 x i32> <i32 -128, i32 0, i32 -128, i32 0>
+}
+
+define <16 x i8> @movi_v16i8_3() {
+; COMMON-LABEL: movi_v16i8_3:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    adrp x8, .LCPI18_0
+; COMMON-NEXT:    ldr q0, [x8, :lo12:.LCPI18_0]
+; COMMON-NEXT:    ret
+  ret <16 x i8> <i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0>
+}