[AArch64][GlobalISel] Implement selection for G_UNMERGE of vectors to vectors.

This reuses the existing support for extracting vector elements
(the G_EXTRACT_VECTOR_ELT lane-copy path) to extract the subvectors.
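
For example, an unmerge of a <4 x s32> vector into two <2 x s32>
subvectors:

  %1:fpr(<2 x s32>), %2:fpr(<2 x s32>) = G_UNMERGE_VALUES %0(<4 x s32>)

now selects to a subregister copy for subvector 0 and a lane copy for
subvector 1 (a MIR sketch mirroring the select-unmerge.mir test added
below; virtual register numbers are illustrative):

  %1:fpr64 = COPY %0.dsub
  %2:fpr64 = CPYi64 %0, 1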

Differential Revision: https://reviews.llvm.org/D59390

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356213 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index e1f26ec..a2eb920 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -93,6 +93,8 @@
   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
+  bool selectSplitVectorUnmerge(MachineInstr &I,
+                                MachineRegisterInfo &MRI) const;
 
   unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
   MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
@@ -102,6 +104,10 @@
   MachineInstr *emitVectorConcat(Optional<unsigned> Dst, unsigned Op1,
                                  unsigned Op2,
                                  MachineIRBuilder &MIRBuilder) const;
+  MachineInstr *emitExtractVectorElt(Optional<unsigned> DstReg,
+                                     const RegisterBank &DstRB, LLT ScalarTy,
+                                     unsigned VecReg, unsigned LaneIdx,
+                                     MachineIRBuilder &MIRBuilder) const;
 
   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
 
@@ -1870,6 +1876,68 @@
   return true;
 }
 
+MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
+    Optional<unsigned> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
+    unsigned VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
+  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+  unsigned CopyOpc = 0;
+  unsigned ExtractSubReg = 0;
+  if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
+    LLVM_DEBUG(
+        dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
+    return nullptr;
+  }
+
+  const TargetRegisterClass *DstRC =
+      getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
+  if (!DstRC) {
+    LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
+    return nullptr;
+  }
+
+  const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
+  const LLT &VecTy = MRI.getType(VecReg);
+  const TargetRegisterClass *VecRC =
+      getRegClassForTypeOnBank(VecTy, VecRB, RBI, true);
+  if (!VecRC) {
+    LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
+    return nullptr;
+  }
+
+  // The vector register we will lane-copy from (widened below if needed).
+  unsigned InsertReg = VecReg;
+  if (!DstReg)
+    DstReg = MRI.createVirtualRegister(DstRC);
+  // If the lane index is 0, we just use a subregister COPY.
+  if (LaneIdx == 0) {
+    auto CopyMI =
+        BuildMI(MIRBuilder.getMBB(), MIRBuilder.getInsertPt(),
+                MIRBuilder.getDL(), TII.get(TargetOpcode::COPY), *DstReg)
+            .addUse(VecReg, 0, ExtractSubReg);
+    RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
+    return &*CopyMI;
+  }
+
+  // Lane copies require 128-bit wide registers. If we're dealing with an
+  // unpacked vector, then we need to move up to that width. Insert an implicit
+  // def and a subregister insert to get us there.
+  if (VecTy.getSizeInBits() != 128) {
+    MachineInstr *ScalarToVector = emitScalarToVector(
+        VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
+    if (!ScalarToVector)
+      return nullptr;
+    InsertReg = ScalarToVector->getOperand(0).getReg();
+  }
+
+  MachineInstr *LaneCopyMI =
+      MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
+  constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
+
+  // Make sure that we actually constrain the initial copy.
+  RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
+  return LaneCopyMI;
+}
+
 bool AArch64InstructionSelector::selectExtractElt(
     MachineInstr &I, MachineRegisterInfo &MRI) const {
   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
@@ -1878,7 +1946,7 @@
   const LLT NarrowTy = MRI.getType(DstReg);
   const unsigned SrcReg = I.getOperand(1).getReg();
   const LLT WideTy = MRI.getType(SrcReg);
-
+  (void)WideTy;
   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
          "source register size too small!");
   assert(NarrowTy.isScalar() && "cannot extract vector into vector!");
@@ -1897,63 +1965,44 @@
   if (!getConstantValueForReg(LaneIdxOp.getReg(), MRI, LaneIdx))
     return false;
 
-  unsigned CopyOpc = 0;
-  unsigned ExtractSubReg = 0;
-  if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits())) {
-    LLVM_DEBUG(
-        dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
-    return false;
-  }
-
-  const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
-  const TargetRegisterClass *DstRC =
-      getRegClassForTypeOnBank(NarrowTy, DstRB, RBI, true);
-  if (!DstRC) {
-    LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
-    return false;
-  }
-
-  const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
-  const TargetRegisterClass *SrcRC =
-      getRegClassForTypeOnBank(WideTy, SrcRB, RBI, true);
-  if (!SrcRC) {
-    LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
-    return false;
-  }
-
-  // The register that we're going to copy into.
-  unsigned InsertReg = SrcReg;
   MachineIRBuilder MIRBuilder(I);
 
-  // If the lane index is 0, we just use a subregister COPY.
-  if (LaneIdx == 0) {
-    unsigned CopyTo = I.getOperand(0).getReg();
-    BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
-            CopyTo)
-        .addUse(SrcReg, 0, ExtractSubReg);
-    RBI.constrainGenericRegister(CopyTo, *DstRC, MRI);
-    I.eraseFromParent();
-    return true;
+  const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+  MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
+                                               LaneIdx, MIRBuilder);
+  if (!Extract)
+    return false;
+
+  I.eraseFromParent();
+  return true;
+}
+
+bool AArch64InstructionSelector::selectSplitVectorUnmerge(
+    MachineInstr &I, MachineRegisterInfo &MRI) const {
+  unsigned NumElts = I.getNumOperands() - 1;
+  unsigned SrcReg = I.getOperand(NumElts).getReg();
+  const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
+  const LLT SrcTy = MRI.getType(SrcReg);
+
+  assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
+  if (SrcTy.getSizeInBits() > 128) {
+    LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge\n");
+    return false;
   }
 
-  // Lane copies require 128-bit wide registers. If we're dealing with an
-  // unpacked vector, then we need to move up to that width. Insert an implicit
-  // def and a subregister insert to get us there.
-  if (WideTy.getSizeInBits() != 128) {
-    MachineInstr *ScalarToVector = emitScalarToVector(
-        WideTy.getSizeInBits(), &AArch64::FPR128RegClass, SrcReg, MIRBuilder);
-    if (!ScalarToVector)
+  MachineIRBuilder MIB(I);
+
+  // We implement a split vector operation by treating the sub-vectors as
+  // scalars and extracting them.
+  const RegisterBank &DstRB =
+      *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
+  for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
+    unsigned Dst = I.getOperand(OpIdx).getReg();
+    MachineInstr *Extract =
+        emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
+    if (!Extract)
       return false;
-    InsertReg = ScalarToVector->getOperand(0).getReg();
   }
-
-  MachineInstr *LaneCopyMI =
-      MIRBuilder.buildInstr(CopyOpc, {DstReg}, {InsertReg}).addImm(LaneIdx);
-  constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
-
-  // Make sure that we actually constrain the initial copy.
-  RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
-
   I.eraseFromParent();
   return true;
 }
@@ -1984,11 +2033,8 @@
   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
          "source register size too small!");
 
-  // TODO: Handle unmerging into vectors.
-  if (!NarrowTy.isScalar()) {
-    LLVM_DEBUG(dbgs() << "Vector-to-vector unmerges not supported yet.\n");
-    return false;
-  }
+  if (!NarrowTy.isScalar())
+    return selectSplitVectorUnmerge(I, MRI);
 
   // Choose a lane copy opcode and subregister based off of the size of the
   // vector's elements.
diff --git a/test/CodeGen/AArch64/GlobalISel/select-unmerge.mir b/test/CodeGen/AArch64/GlobalISel/select-unmerge.mir
index 6814b99..fdc5f12 100644
--- a/test/CodeGen/AArch64/GlobalISel/select-unmerge.mir
+++ b/test/CodeGen/AArch64/GlobalISel/select-unmerge.mir
@@ -19,6 +19,14 @@
     ret <8 x half> %a
   }
 
+  define <2 x float> @test_vecsplit_2v2s32_v4s32(<4 x float> %a) {
+    ret <2 x float> undef
+  }
+
+  define <2 x half> @test_vecsplit_2v2s16_v4s16(<4 x half> %a) {
+    ret <2 x half> undef
+  }
+
 ...
 ---
 name:            test_v2s64_unmerge
@@ -152,3 +160,51 @@
     $q0 = COPY %1(<8 x s16>)
     RET_ReallyLR implicit $q0
 ...
+---
+name:            test_vecsplit_2v2s32_v4s32
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $q0
+    ; CHECK-LABEL: name: test_vecsplit_2v2s32_v4s32
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY [[COPY]].dsub
+    ; CHECK: [[CPYi64_:%[0-9]+]]:fpr64 = CPYi64 [[COPY]], 1
+    ; CHECK: $d0 = COPY [[COPY1]]
+    ; CHECK: $d1 = COPY [[CPYi64_]]
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:fpr(<4 x s32>) = COPY $q0
+    %1:fpr(<2 x s32>), %2:fpr(<2 x s32>) = G_UNMERGE_VALUES %0(<4 x s32>)
+    $d0 = COPY %1(<2 x s32>)
+    $d1 = COPY %2(<2 x s32>)
+    RET_ReallyLR implicit $d0
+...
+---
+name:            test_vecsplit_2v2s16_v4s16
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $d0
+    ; CHECK-LABEL: name: test_vecsplit_2v2s16_v4s16
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr32 = COPY [[COPY]].ssub
+    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
+    ; CHECK: [[CPYi32_:%[0-9]+]]:fpr32 = CPYi32 [[INSERT_SUBREG]], 1
+    ; CHECK: $s0 = COPY [[COPY1]]
+    ; CHECK: $s1 = COPY [[CPYi32_]]
+    ; CHECK: RET_ReallyLR implicit $s0
+    %0:fpr(<4 x s16>) = COPY $d0
+    %1:fpr(<2 x s16>), %2:fpr(<2 x s16>) = G_UNMERGE_VALUES %0(<4 x s16>)
+    $s0 = COPY %1(<2 x s16>)
+    $s1 = COPY %2(<2 x s16>)
+    RET_ReallyLR implicit $s0
+...