[AMDGPU] Propagate AGPR RC from PHI to its PHI operands We can fix register class of PHI based on its all AGPR uses. That leaves behind all PHIs which were already processed earlier. Propagate RC back to PHI operands of a PHI. Differential Revision: https://reviews.llvm.org/D77344

commit: 0462795095e57d9cac3c68ff1f28bc4bef8c4865 [log] [tgz]
author: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> Thu Apr 02 16:06:45 2020 -0700
committer: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> Fri Apr 03 11:23:02 2020 -0700
tree: 5328f7268dd6abeba143651b3765ec75007dbbb2
parent: b4b7c989d6f1f255b49c34caf38748062d5402cc [diff]
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index f067e96..7283c33 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

@@ -766,6 +766,7 @@
   bool AllAGPRUses = true;
   SetVector<const MachineInstr *> worklist;
   SmallSet<const MachineInstr *, 4> Visited;
+  SetVector<MachineInstr *> PHIOperands;
   worklist.insert(&MI);
   Visited.insert(&MI);
   while (!worklist.empty()) {
@@ -810,6 +811,11 @@
   if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
     LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
     MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
+    for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+      MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
+      if (DefMI && DefMI->isPHI())
+        PHIOperands.insert(DefMI);
+    }
   }
 
   bool hasVGPRInput = false;
@@ -845,4 +851,8 @@
     TII->legalizeOperands(MI, MDT);
   }
 
+  // Propagate register class back to PHI operands which are PHI themselves.
+  while (!PHIOperands.empty()) {
+    processPHINode(*PHIOperands.pop_back_val());
+  }
 }

diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index b66c9d4..9b5438c 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll

@@ -487,5 +487,50 @@
   ret void
 }
 
+; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+
+; Check that we do not copy agprs to vgprs and back in an outer loop.
+
+; GCN: [[OUTER_LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: [[INNER_LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[INNER_LOOP]]
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[OUTER_LOOP]]
+
+; Final result should be read only once after the loop.
+
+; GCN-COUNT-32: v_accvgpr_read_b32
+
+define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
+entry:
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
+  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
+  br label %inner.for.cond.preheader
+
+inner.for.cond.preheader:
+  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
+  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader
+
+inner.exit:
+  %inc.0 = add nuw nsw i32 %c.0, 1
+  %cc.0 = icmp eq i32 %inc.0, 16
+  br i1 %cc.0, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 declare i32 @llvm.amdgcn.workitem.id.x()
commit	0462795095e57d9cac3c68ff1f28bc4bef8c4865	[log] [tgz]
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	Thu Apr 02 16:06:45 2020 -0700
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	Fri Apr 03 11:23:02 2020 -0700
tree	5328f7268dd6abeba143651b3765ec75007dbbb2
parent	b4b7c989d6f1f255b49c34caf38748062d5402cc [diff]