[AMDGPU] Enable copy between VGPR and AGPR classes during regalloc

The greedy register allocator prefers moving a constrained
live range into a larger allocatable class over spilling
it. This patch defines the necessary superclasses for
vector registers. For subtargets that support copies
between VGPRs and AGPRs, vector register spills during
regalloc now become plain copies.
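
For example, in the updated extend-phi-subrange-not-in-parent.mir
test below, a spill/restore pair of a vreg_1024_align2 value
(virtual register numbers are illustrative):

  SI_SPILL_V1024_SAVE %1, %stack.0, $sgpr32, 0, implicit $exec
  ...
  %5:vreg_1024_align2 = SI_SPILL_V1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec

becomes a single copy into the AV superclass:

  %5:av_1024_align2 = COPY %1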

Reviewed By: rampitec, arsenm

Differential Revision: https://reviews.llvm.org/D109301
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 88996f4..92f5322 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1435,6 +1435,7 @@
       FrameInfo.getObjectAlign(FrameIndex));
   unsigned SpillSize = TRI->getSpillSize(*RC);
 
+  MachineRegisterInfo &MRI = MF->getRegInfo();
   if (RI.isSGPRClass(RC)) {
     MFI->setHasSpilledSGPRs();
     assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
@@ -1448,7 +1449,6 @@
     // The SGPR spill/restore instructions only work on number sgprs, so we need
     // to make sure we are using the correct register class.
     if (SrcReg.isVirtual() && SpillSize == 4) {
-      MachineRegisterInfo &MRI = MF->getRegInfo();
       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
     }
 
@@ -1467,6 +1467,17 @@
                                        : getVGPRSpillSaveOpcode(SpillSize);
   MFI->setHasSpilledVGPRs();
 
+  if (RI.isVectorSuperClass(RC)) {
+    // Convert an AV spill into a VGPR spill. Introduce a copy from the AV
+    // register to an equivalent VGPR register beforehand. Regalloc may
+    // introduce AV spills that are only relevant until the rewriter, at
+    // which point they become either VGPR or AGPR spills.
+    Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC));
+    BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg)
+        .addReg(SrcReg, RegState::Kill);
+    SrcReg = TmpVReg;
+  }
+
   BuildMI(MBB, MI, DL, get(Opcode))
     .addReg(SrcReg, getKillRegState(isKill)) // data
     .addFrameIndex(FrameIndex)               // addr
@@ -1600,11 +1611,24 @@
 
   unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                        : getVGPRSpillRestoreOpcode(SpillSize);
+
+  bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
+  Register TmpReg = DestReg;
+  if (IsVectorSuperClass) {
+    // For AV classes, restore the spill into a VGPR, then copy it into the
+    // equivalent AV register.
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+    DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC));
+  }
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
     .addFrameIndex(FrameIndex)        // vaddr
     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
     .addImm(0)                           // offset
     .addMemOperand(MMO);
+
+  if (IsVectorSuperClass)
+    BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg)
+        .addReg(DestReg, RegState::Kill);
 }
 
 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 2b72c0b..a1d9a23 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -402,6 +402,62 @@
   return CSR_AMDGPU_NoRegs_RegMask;
 }
 
+const TargetRegisterClass *
+SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                          const MachineFunction &MF) const {
+  // FIXME: Should have a helper function like getEquivalentVGPRClass to get
+  // the equivalent AV class. If one were used here, the verifier would crash
+  // after RegBankSelect in the GISel flow, since the aligned regclasses are
+  // not fully set up until instruction selection.
+  if (MF.getSubtarget<GCNSubtarget>().hasMAIInsts() &&
+      (isVGPRClass(RC) || isAGPRClass(RC))) {
+    if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
+      return &AMDGPU::AV_32RegClass;
+    if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
+      return &AMDGPU::AV_64RegClass;
+    if (RC == &AMDGPU::VReg_64_Align2RegClass ||
+        RC == &AMDGPU::AReg_64_Align2RegClass)
+      return &AMDGPU::AV_64_Align2RegClass;
+    if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
+      return &AMDGPU::AV_96RegClass;
+    if (RC == &AMDGPU::VReg_96_Align2RegClass ||
+        RC == &AMDGPU::AReg_96_Align2RegClass)
+      return &AMDGPU::AV_96_Align2RegClass;
+    if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
+      return &AMDGPU::AV_128RegClass;
+    if (RC == &AMDGPU::VReg_128_Align2RegClass ||
+        RC == &AMDGPU::AReg_128_Align2RegClass)
+      return &AMDGPU::AV_128_Align2RegClass;
+    if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
+      return &AMDGPU::AV_160RegClass;
+    if (RC == &AMDGPU::VReg_160_Align2RegClass ||
+        RC == &AMDGPU::AReg_160_Align2RegClass)
+      return &AMDGPU::AV_160_Align2RegClass;
+    if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
+      return &AMDGPU::AV_192RegClass;
+    if (RC == &AMDGPU::VReg_192_Align2RegClass ||
+        RC == &AMDGPU::AReg_192_Align2RegClass)
+      return &AMDGPU::AV_192_Align2RegClass;
+    if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
+      return &AMDGPU::AV_256RegClass;
+    if (RC == &AMDGPU::VReg_256_Align2RegClass ||
+        RC == &AMDGPU::AReg_256_Align2RegClass)
+      return &AMDGPU::AV_256_Align2RegClass;
+    if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
+      return &AMDGPU::AV_512RegClass;
+    if (RC == &AMDGPU::VReg_512_Align2RegClass ||
+        RC == &AMDGPU::AReg_512_Align2RegClass)
+      return &AMDGPU::AV_512_Align2RegClass;
+    if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
+      return &AMDGPU::AV_1024RegClass;
+    if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
+        RC == &AMDGPU::AReg_1024_Align2RegClass)
+      return &AMDGPU::AV_1024_Align2RegClass;
+  }
+
+  return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
+}
+
 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const SIFrameLowering *TFI =
       MF.getSubtarget<GCNSubtarget>().getFrameLowering();
@@ -994,10 +1050,22 @@
 
   unsigned Dst = IsStore ? Reg : ValueReg;
   unsigned Src = IsStore ? ValueReg : Reg;
-  unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
-                                                   : AMDGPU::V_ACCVGPR_READ_B32_e64;
+  bool IsVGPR = TRI->isVGPR(MRI, Reg);
+  DebugLoc DL = MI->getDebugLoc();
+  if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
+    // The spiller during regalloc may restore a spilled register to its
+    // superclass. That can result in AGPR spills being restored to VGPRs or
+    // the other way around, leaving src and dst with identical regclasses at
+    // this point. A plain copy is all that is needed in such cases.
+    auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
+                       .addReg(Src, getKillRegState(IsKill));
+    CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
+    return CopyMIB;
+  }
+  unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
+                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;
 
-  auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
+  auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
                  .addReg(Src, getKillRegState(IsKill));
   MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
   return MIB;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 49a2814..a6321f5 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -77,6 +77,10 @@
     return 100;
   }
 
+  const TargetRegisterClass *
+  getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                            const MachineFunction &MF) const override;
+
   Register getFrameRegister(const MachineFunction &MF) const override;
 
   bool hasBasePointer(const MachineFunction &MF) const;
diff --git a/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir b/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir
index 6a855c2..e419a87 100644
--- a/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir
+++ b/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir
@@ -20,7 +20,7 @@
   ; CHECK:   successors: %bb.1(0x80000000)
   ; CHECK:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[DEF1:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
-  ; CHECK:   SI_SPILL_V1024_SAVE [[DEF1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5)
+  ; CHECK:   [[COPY:%[0-9]+]]:av_1024_align2 = COPY [[DEF1]]
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; CHECK:   S_NOP 0, implicit [[DEF1]]
@@ -29,18 +29,17 @@
   ; CHECK:   S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
   ; CHECK: bb.2:
   ; CHECK:   successors: %bb.3(0x80000000)
-  ; CHECK:   [[SI_SPILL_V1024_RESTORE:%[0-9]+]]:vreg_1024_align2 = SI_SPILL_V1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5)
-  ; CHECK:   undef %5.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:vreg_1024_align2 = COPY [[SI_SPILL_V1024_RESTORE]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
-  ; CHECK:     internal %5.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:vreg_1024_align2 = COPY [[SI_SPILL_V1024_RESTORE]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
+  ; CHECK:   undef %5.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
+  ; CHECK:     internal %5.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
   ; CHECK:   }
-  ; CHECK:   %5.sub0:vreg_1024_align2 = IMPLICIT_DEF
+  ; CHECK:   %5.sub0:av_1024_align2 = IMPLICIT_DEF
   ; CHECK:   S_NOP 0, implicit %5.sub0
   ; CHECK: bb.3:
   ; CHECK:   successors: %bb.4(0x80000000)
   ; CHECK:   S_NOP 0, implicit %5
   ; CHECK: bb.4:
   ; CHECK:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
-  ; CHECK:   [[DEF2:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
+  ; CHECK:   [[DEF2:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF
   ; CHECK:   S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc
   ; CHECK: bb.5:
   ; CHECK:   undef %3.sub0:vreg_1024_align2 = COPY [[DEF]]
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
new file mode 100644
index 0000000..bcc67c9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -0,0 +1,62 @@
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,1 -verify-machineinstrs < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=PEI-GFX908 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,1 -verify-machineinstrs < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=PEI-GFX90A %s
+
+; Partial reg copy and spill missed during regalloc are handled later at frame lowering.
+define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
+  ; REGALLOC-GFX908-LABEL: name: partial_copy
+  ; REGALLOC-GFX908: bb.0 (%ir-block.0):
+  ; REGALLOC-GFX908:  INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2949130 /* regdef:VReg_64 */, def [[VREG_64:%[0-9]+]]
+  ; REGALLOC-GFX908:  SI_SPILL_V64_SAVE [[VREG_64]], %stack.0
+  ; REGALLOC-GFX908:  [[V_MFMA_I32_4X4X4I8_A128:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64
+  ; REGALLOC-GFX908:  [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0
+  ; REGALLOC-GFX908:  GLOBAL_STORE_DWORDX2 undef %{{[0-9]+}}:vreg_64, [[SI_SPILL_V64_RESTORE]]
+  ; REGALLOC-GFX908:  [[COPY_A128_TO_V128:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_A128]]
+  ; REGALLOC-GFX908:  GLOBAL_STORE_DWORDX4 undef %{{[0-9]+}}:vreg_64, [[COPY_A128_TO_V128]]
+  ;
+  ; PEI-GFX908-LABEL: name: partial_copy
+  ; PEI-GFX908: bb.0 (%ir-block.0):
+  ; PEI-GFX908:  INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2949130 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
+  ; PEI-GFX908:  BUFFER_STORE_DWORD_OFFSET killed $vgpr0
+  ; PEI-GFX908:  $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1
+  ; PEI-GFX908:  renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64
+  ; PEI-GFX908:  $vgpr0 = BUFFER_LOAD_DWORD_OFFSET
+  ; PEI-GFX908:  $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4
+  ; PEI-GFX908:  GLOBAL_STORE_DWORDX2 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1
+  ; PEI-GFX908:  renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
+  ; PEI-GFX908:  GLOBAL_STORE_DWORDX4 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3
+  ;
+  ; REGALLOC-GFX90A-LABEL: name: partial_copy
+  ; REGALLOC-GFX90A: bb.0 (%ir-block.0):
+  ; REGALLOC-GFX90A:  INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def [[VREG_64:%[0-9]+]]
+  ; REGALLOC-GFX90A:  SI_SPILL_V64_SAVE [[VREG_64]], %stack.0
+  ; REGALLOC-GFX90A:  [[V_MFMA_I32_4X4X4I8_A128:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64
+  ; REGALLOC-GFX90A:  [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64_align2 = SI_SPILL_V64_RESTORE %stack.0
+  ; REGALLOC-GFX90A:  [[COPY_AV64:%[0-9]+]]:av_64_align2 = COPY [[SI_SPILL_V64_RESTORE]]
+  ; REGALLOC-GFX90A:  GLOBAL_STORE_DWORDX2 undef %15:vreg_64_align2, [[COPY_AV64]]
+  ; REGALLOC-GFX90A-NOT:  %{{[0-9]+}}:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_A128]]
+  ; REGALLOC-GFX90A:  GLOBAL_STORE_DWORDX4 undef %17:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_A128]]
+  ;
+  ; PEI-GFX90A-LABEL: name: partial_copy
+  ; PEI-GFX90A: bb.0 (%ir-block.0):
+  ; PEI-GFX90A:  INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
+  ; PEI-GFX90A:  BUFFER_STORE_DWORD_OFFSET killed $vgpr0
+  ; PEI-GFX90A:  $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1
+  ; PEI-GFX90A:  renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64
+  ; PEI-GFX90A:  $vgpr0 = BUFFER_LOAD_DWORD_OFFSET
+  ; PEI-GFX90A:  $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4
+  ; PEI-GFX90A:  GLOBAL_STORE_DWORDX2 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1
+  ; PEI-GFX90A:  GLOBAL_STORE_DWORDX4 undef renamable ${{.*}}, killed renamable $agpr0_agpr1_agpr2_agpr3
+  %v0 = call <4 x i32> asm sideeffect "; def $0", "=v" ()
+  %v1 = call <2 x i32> asm sideeffect "; def $0", "=v" ()
+  %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0)
+  store volatile <4 x i32> %v0, <4 x i32> addrspace(1)* undef
+  store volatile <2 x i32> %v1, <2 x i32> addrspace(1)* undef
+  store volatile <4 x i32> %mai, <4 x i32> addrspace(1)* undef
+  ret void
+}
+
+declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
+
+attributes #0 = { nounwind "amdgpu-num-vgpr"="5" }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 91965f6..da5cbde 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -1,23 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2V %s
-; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908-A2M,A2M %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX90A-A2M,A2M %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
 
 ; GCN-LABEL: {{^}}max_24regs_32a_used:
-; A2M-DAG:        s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; A2M-DAG:        s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GCN-DAG:        v_mfma_f32_16x16x1f32
-; GCN-DAG:        v_mfma_f32_16x16x1f32
-; A2V-NOT:        SCRATCH_RSRC
-; GFX908-A2M-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
-; A2V:            v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
-; GFX908-A2M:     buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; GFX908-A2M:     buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
-; GFX90A-NOT:     v_accvgpr_read_b32
-; GFX90A-A2M:     buffer_store_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; GFX90A-A2M:     buffer_load_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
-; GFX908:         v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
-; GFX90A-NOT:     v_accvgpr_write_b32
-; A2V:            ScratchSize: 0
+; GCN-NOT:     s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN-NOT:     s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GCN-DAG:     v_mfma_f32_16x16x1f32
+; GCN-DAG:     v_mfma_f32_16x16x1f32
+; GCN-DAG:     v_accvgpr_read_b32
+; GCN-NOT:     buffer_store_dword
+; GCN-NOT:     buffer_load_dword
+; GFX908-NOT:  v_accvgpr_write_b32
+; GFX90A:      v_accvgpr_write_b32
+; GCN:         ScratchSize: 0
 define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
 bb:
   %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
@@ -39,16 +33,13 @@
 }
 
 ; GCN-LABEL: {{^}}max_12regs_13a_used:
-; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; A2V-NOT:    SCRATCH_RSRC
-; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
-; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
-; GFX90A-A2M: buffer_store_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; GFX90A-A2M: buffer_load_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
-; A2V:        v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
-; A2V:        ScratchSize: 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GCN:     v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
+; GCN-NOT: buffer_store_dword
+; GCN-NOT: buffer_load_dword
+; GCN:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
+; GCN:     ScratchSize: 0
 define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
 bb:
   %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
@@ -70,17 +61,13 @@
 }
 
 ; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
-; GFX908-A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GFX908-A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; A2V-NOT:    SCRATCH_RSRC
-
-; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
-; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
-; A2V: ScratchSize: 0
-
-; GFX908-A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; GFX908-A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
-; GFX908-A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GCN:     v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
+; GCN-NOT: buffer_store_dword
+; GCN-NOT: buffer_load_dword
+; GCN:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
+; GCN:     ScratchSize: 0
 define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
   %a1 = call <4 x i32> asm sideeffect "", "=a"()
   %a2 = call <4 x i32> asm sideeffect "", "=a"()
@@ -92,17 +79,14 @@
 }
 
 ; GCN-LABEL: {{^}}max_32regs_mfma32:
-; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; A2V-NOT:    SCRATCH_RSRC
-; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
-; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A:     v_mfma_f32_32x32x1f32
-; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
-; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
-; GFX90A-NOT: v_accvgpr_write_b32
-; A2V:        ScratchSize: 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GCN-NOT: buffer_store_dword
+; GCN:     v_accvgpr_read_b32
+; GCN:     v_mfma_f32_32x32x1f32
+; GCN-NOT: buffer_load_dword
+; GCN:     v_accvgpr_write_b32
+; GCN:     ScratchSize: 0
 define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
 bb:
   %v = call i32 asm sideeffect "", "=a"()
@@ -116,6 +100,47 @@
   ret void
 }
 
+; Should spill AGPRs to memory for both gfx908 and gfx90a.
+; GCN-LABEL: {{^}}max_5regs_used_8a:
+; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+
+; GFX908-DAG:  v_accvgpr_read_b32 v1, a0 ; Reload Reuse
+; GFX908-DAG:  buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
+; GFX908-DAG:  v_accvgpr_read_b32 v1, a1 ; Reload Reuse
+; GFX908-DAG:  buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
+; GFX908-DAG:  v_accvgpr_read_b32 v1, a2 ; Reload Reuse
+; GFX908-DAG:  buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
+; GFX908-DAG:  v_accvgpr_read_b32 v1, a3 ; Reload Reuse
+; GFX908-DAG:  buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
+
+; GFX90A-DAG:  buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
+; GFX90A-DAG:  buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
+; GFX90A-DAG:  buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
+; GFX90A-DAG:  buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
+
+; GCN:  v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
+
+; GCN-DAG:  buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
+; GCN-DAG:  buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
+; GCN-DAG:  buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
+; GCN-DAG:  buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload
+
+; GCN: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off
+; GCN: ScratchSize: 20
+define amdgpu_kernel void @max_5regs_used_8a(<4 x float> addrspace(1)* %arg) #4 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %v0 = call float asm sideeffect "; def $0", "=v"()
+  %a4 = call <4 x float> asm sideeffect "; def $0", "=a"()
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
+  %mai.in = load <4 x float>, <4 x float> addrspace(1)* %gep
+  %mai.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.in, i32 0, i32 0, i32 0)
+  store <4 x float> %mai.out, <4 x float> addrspace(1)* %gep
+  store volatile <4 x float> %a4, <4 x float> addrspace(1)* undef
+  call void asm sideeffect "; use $0", "v"(float %v0);
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
@@ -125,3 +150,4 @@
 attributes #1 = { nounwind "amdgpu-num-vgpr"="10" }
 attributes #2 = { nounwind "amdgpu-num-vgpr"="12" }
 attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }
+attributes #4 = { nounwind "amdgpu-num-vgpr"="5" }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
new file mode 100644
index 0000000..47349bc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+; Convert AV spills into VGPR spills by introducing appropriate copies in between.
+
+define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
+  ; GCN-LABEL: name: test_spill_av_class
+  ; GCN:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %21.sub0
+  ; GCN-NEXT:   undef %23.sub0:av_64 = COPY %21.sub0
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vreg_64 = COPY %23
+  ; GCN-NEXT:   SI_SPILL_V64_SAVE [[COPY1]], %stack.0, $sgpr32, 0, implicit $exec
+  ; GCN:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:av_64 = COPY [[SI_SPILL_V64_RESTORE]]
+  ; GCN-NEXT:   undef %22.sub0:vreg_64 = COPY [[COPY3]].sub0
+  %v0 = call i32 asm sideeffect "; def $0", "=v"()
+  %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0
+  %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0)
+  store volatile <4 x i32> %mai, <4 x i32> addrspace(1)* undef
+  call void asm sideeffect "; use $0", "v"(<2 x i32> %tmp);
+  ret void
+}
+
+declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
+
+attributes #0 = { nounwind "amdgpu-num-vgpr"="5" }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
index 6664820..4c5e7ec 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -5,15 +5,14 @@
 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; GFX908-NOT: SCRATCH_RSRC
-; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse
-; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse
+; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}}
 ; GFX900:     buffer_store_dword v{{[0-9]}},
 ; GFX900:     buffer_store_dword v{{[0-9]}},
 ; GFX900:     buffer_load_dword v{{[0-9]}},
 ; GFX900:     buffer_load_dword v{{[0-9]}},
 ; GFX908-NOT: buffer_
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0 ; Reload Reuse
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse
+; GFX908-DAG: v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]]
+; GFX908-DAG: v_accvgpr_read_b32 [[V_REG]], [[A_REG]]
 
 ; GCN:    NumVgprs: 10
 ; GFX900: ScratchSize: 12
@@ -57,19 +56,19 @@
 }
 
 ; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
-; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
-; GFX908:     buffer_store_dword v{{[0-9]}},
+; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}}
+; GFX908-NOT: buffer_store_dword v{{[0-9]}},
 ; GFX908-NOT: buffer_
-; GFX908:     v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse
-; GFX908:     buffer_load_dword v{{[0-9]}},
+; GFX908:     v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]]
+; GFX908:     v_accvgpr_read_b32 [[V_REG]], [[A_REG]]
 ; GFX908-NOT: buffer_
 
 ; GFX900:     couldn't allocate input reg for constraint 'a'
 
 ; GFX908: NumVgprs: 10
-; GFX908: ScratchSize: 8
+; GFX908: ScratchSize: 0
 ; GFX908: VGPRBlocks: 2
 ; GFX908: NumVGPRsForWavesPerEU: 10
 define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 {
@@ -113,28 +112,28 @@
 ; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
 ; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; GFX908-DAG: v_accvgpr_write_b32 a0, 1
-; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse
-; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} ; Reload Reuse
-; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} ; Reload Reuse
-; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}} ; Reload Reuse
-; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}} ; Reload Reuse
-; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}} ; Reload Reuse
-; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} ; Reload Reuse
-; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} ; Reload Reuse
-; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
-; GFX900:     buffer_store_dword v{{[0-9]}},
 ; GCN-DAG:    buffer_store_dword v{{[0-9]}},
-; GFX900:     buffer_load_dword v{{[0-9]}},
+; GCN-DAG:    buffer_store_dword v{{[0-9]}},
+; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
 ; GCN-DAG:    buffer_load_dword v{{[0-9]}},
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 ; Reload Reuse
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 ; Reload Reuse
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4 ; Reload Reuse
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5 ; Reload Reuse
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6 ; Reload Reuse
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7 ; Reload Reuse
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 ; Reload Reuse
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse
+; GCN-DAG:    buffer_load_dword v{{[0-9]}},
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9
 
 ; GCN:    NumVgprs: 10
 ; GFX900: ScratchSize: 44
@@ -166,10 +165,10 @@
 ; GCN-LABEL: {{^}}max_10_vgprs_spill_v32:
 ; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
 ; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse
-; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
-; GCN-NOT:    a10
 ; GCN:        buffer_store_dword v{{[0-9]}},
+; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
+; GCN-NOT:    a10
 
 ; GFX908: NumVgprs: 10
 ; GFX900: ScratchSize: 100
@@ -231,23 +230,20 @@
   ret void
 }
 
-; FIXME: adding an AReg_1024 register class for v32f32 and v32i32
-;        produces unnecessary copies and we still have some amount
-;        of conventional spilling.
-
 ; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GFX908-FIXME-NOT: SCRATCH_RSRC
-; GFX908-DAG: v_accvgpr_write_b32 a0, v
+; GFX908-NOT: SCRATCH_RSRC
+; GFX908: v_accvgpr_write_b32
+; GFX908:  global_load_
 ; GFX900:     buffer_store_dword v
 ; GFX900:     buffer_load_dword v
-; GFX908-FIXME-NOT: buffer_
+; GFX908-NOT: buffer_
 ; GFX908-DAG: v_accvgpr_read_b32
 
 ; GCN:    NumVgprs: 256
 ; GFX900: ScratchSize: 2052
-; GFX908-FIXME: ScratchSize: 0
+; GFX908: ScratchSize: 0
 ; GCN:    VGPRBlocks: 63
 ; GCN:    NumVGPRsForWavesPerEU: 256
 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir b/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir
new file mode 100644
index 0000000..474e755
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir
@@ -0,0 +1,224 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-enable-flat-scratch -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
+
+# A spilled register can be restored to its superclass during regalloc.
+# As a result, we might see AGPR spills restored to VGPRs or the other way around.
+
+---
+name: partial_spill_a128_restore_to_v128_1_of_4
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 16, alignment: 4 }
+machineFunctionInfo:
+  hasSpilledVGPRs: true
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $vgpr52, $vgpr53, $vgpr54, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+
+    ; GCN-LABEL: name: partial_spill_a128_restore_to_v128_1_of_4
+    ; GCN: liveins: $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GCN: {{  $}}
+    ; GCN: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GCN: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s96) into %stack.0, align 4, addrspace 5)
+    ; GCN: $vgpr51 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
+    ; GCN: $vgpr48_vgpr49_vgpr50 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s96) from %stack.0, align 4, addrspace 5)
+    ; GCN: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr54, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
+    $vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr54, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+...
+
+---
+name: partial_spill_a128_restore_to_v128_2_of_4
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 16, alignment: 4 }
+machineFunctionInfo:
+  hasSpilledVGPRs: true
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $vgpr52, $vgpr53, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+
+    ; GCN-LABEL: name: partial_spill_a128_restore_to_v128_2_of_4
+    ; GCN: liveins: $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GCN: {{  $}}
+    ; GCN: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GCN: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GCN: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; GCN: $vgpr51 = COPY $vgpr54, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
+    ; GCN: $vgpr50 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
+    ; GCN: $vgpr48_vgpr49 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; GCN: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
+    $vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+...
+
+---
+name: partial_spill_a128_restore_to_v128_3_of_4
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 16, alignment: 4 }
+machineFunctionInfo:
+  hasSpilledVGPRs: true
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $vgpr52, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+
+    ; GCN-LABEL: name: partial_spill_a128_restore_to_v128_3_of_4
+    ; GCN: liveins: $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    ; GCN: {{  $}}
+    ; GCN: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GCN: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GCN: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GCN: SCRATCH_STORE_DWORD_SADDR killed $agpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN: $vgpr51 = COPY $vgpr53, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
+    ; GCN: $vgpr50 = COPY $vgpr54, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
+    ; GCN: $vgpr49 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51
+    ; GCN: $vgpr48 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+    SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
+    $vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    S_ENDPGM 0, implicit $vgpr52, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+...
+
+---
+name: full_spill_a128_restore_to_v128
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 16, alignment: 4 }
+machineFunctionInfo:
+  hasSpilledVGPRs: true
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $agpr0_agpr1_agpr2_agpr3
+
+    ; GCN-LABEL: name: full_spill_a128_restore_to_v128
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $agpr0_agpr1_agpr2_agpr3
+    ; GCN: {{  $}}
+    ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
+    ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
+    ; GCN: $vgpr55 = COPY $vgpr0, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55
+    ; GCN: $vgpr54 = COPY $vgpr1, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55
+    ; GCN: $vgpr53 = COPY $vgpr2, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55
+    ; GCN: $vgpr52 = COPY $vgpr3, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55
+    ; GCN: S_ENDPGM 0
+    SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
+    $vgpr52_vgpr53_vgpr54_vgpr55 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    S_ENDPGM 0
+...
+
+---
+name: partial_spill_v128_restore_to_a128_1_of_4
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 16, alignment: 4 }
+machineFunctionInfo:
+  hasSpilledVGPRs: true
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $agpr31, $vgpr0_vgpr1_vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr24_agpr25
+
+    ; GCN-LABEL: name: partial_spill_v128_restore_to_a128_1_of_4
+    ; GCN: liveins: $agpr30, $agpr31, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr24_agpr25, $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: {{  $}}
+    ; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s96) into %stack.0, align 4, addrspace 5)
+    ; GCN: $agpr29 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29
+    ; GCN: $agpr26_agpr27_agpr28 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s96) from %stack.0, align 4, addrspace 5)
+    ; GCN: S_ENDPGM 0, implicit $agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25
+    SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
+    $agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    S_ENDPGM 0, implicit $agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25
+...
+
+---
+name: partial_spill_v128_restore_to_a128_2_of_4
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 16, alignment: 4 }
+machineFunctionInfo:
+  hasSpilledVGPRs: true
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr24_agpr25
+
+    ; GCN-LABEL: name: partial_spill_v128_restore_to_a128_2_of_4
+    ; GCN: liveins: $agpr30, $agpr31, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr24_agpr25, $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: {{  $}}
+    ; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; GCN: $agpr29 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29
+    ; GCN: $agpr28 = COPY $agpr31, implicit-def $agpr26_agpr27_agpr28_agpr29
+    ; GCN: $agpr26_agpr27 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25
+    SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
+    $agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25
+...
+
+---
+name: partial_spill_v128_restore_to_a128_3_of_4
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 16, alignment: 4 }
+machineFunctionInfo:
+  hasSpilledVGPRs: true
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr24
+
+    ; GCN-LABEL: name: partial_spill_v128_restore_to_a128_3_of_4
+    ; GCN: liveins: $agpr24, $agpr25, $agpr30, $agpr31, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: {{  $}}
+    ; GCN: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN: $agpr29 = COPY $agpr25, implicit-def $agpr26_agpr27_agpr28_agpr29
+    ; GCN: $agpr28 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29
+    ; GCN: $agpr27 = COPY $agpr31, implicit-def $agpr26_agpr27_agpr28_agpr29
+    ; GCN: $agpr26 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24
+    SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
+    $agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24
+...
+
+---
+name: full_spill_v128_restore_to_a128
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 16, alignment: 4 }
+machineFunctionInfo:
+  hasSpilledVGPRs: true
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+    ; GCN-LABEL: name: full_spill_v128_restore_to_a128
+    ; GCN: liveins: $agpr4, $agpr5, $agpr6, $agpr7, $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: {{  $}}
+    ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN: $agpr3 = COPY $agpr4, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; GCN: $agpr2 = COPY $agpr5, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; GCN: $agpr1 = COPY $agpr6, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; GCN: $agpr0 = COPY $agpr7, implicit-def $agpr0_agpr1_agpr2_agpr3
+    ; GCN: S_ENDPGM 0
+    SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
+    $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    S_ENDPGM 0
+...