Code quality: Combine V_RSQ Combine V_RCP and V_SQRT into V_RSQ on AMDGPU for GlobalISel. Change-Id: I93c5dcb412483156a6e8b68c4085cbce83ac9703

commit: ca57b80cd6767b97477fd157831a2b099b5f8f75 [log] [tgz]
author: Mateja Marjanovic <mateja.marjanovic@amd.com> Mon Sep 20 16:05:45 2021 +0200
committer: Mateja Marjanovic <mateja.marjanovic@amd.com> Tue Nov 30 17:17:15 2021 +0100
tree: 96007a9c75217268db1cbaed46fe1be406c996d6
parent: 3cc21ee6b966204b5d683e715f9e63ae9d63cbd4 [diff]
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index 84aad98..a41166b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h

@@ -526,6 +526,11 @@
   return UnaryOp_match<SrcTy, TargetOpcode::COPY>(std::forward<SrcTy>(Src));
 }
 
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT> m_GFSqrt(const SrcTy &Src) {
+  return UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT>(Src);
+}
+
 // General helper for generic MI compares, i.e. G_ICMP and G_FCMP
 // TODO: Allow checking a specific predicate.
 template <typename Pred_P, typename LHS_P, typename RHS_P, unsigned Opcode>

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index df2f9a0..c7c5ff7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td

@@ -26,6 +26,14 @@
          [{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]),
   (apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>;
 
+
+def rcp_sqrt_to_rsq : GICombineRule<
+  (defs root:$rcp, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_INTRINSIC, G_FSQRT):$rcp,
+         [{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
+
+
 def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">;
 
 def cvt_f32_ubyteN : GICombineRule<
@@ -86,7 +94,8 @@
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPostLegalizerCombinerHelper",
   [all_combines, gfx6gfx7_combines,
-   uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg]> {
+   uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
+   rcp_sqrt_to_rsq]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
   let AdditionalArguments = [];

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 551ab7f..0528b55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

@@ -712,11 +712,6 @@
   (RcpInst $src)
 >;
 
-class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
-  (AMDGPUrcp (fsqrt vt:$src)),
-  (RsqInst $src)
->;
-
 // Instructions which select to the same v_min_f*
 def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
   [(fminnum_ieee node:$src0, node:$src1),

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index fc984d2..1479933 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp

@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Target/TargetMachine.h"
 
 #define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
@@ -58,6 +59,9 @@
   bool matchUCharToFloat(MachineInstr &MI);
   void applyUCharToFloat(MachineInstr &MI);
 
+  bool matchRcpSqrtToRsq(MachineInstr &MI,
+                         std::function<void(MachineIRBuilder &)> &MatchInfo);
+
   // FIXME: Should be able to have 2 separate matchdatas rather than custom
   // struct boilerplate.
   struct CvtF32UByteMatchInfo {
@@ -203,6 +207,48 @@
   MI.eraseFromParent();
 }
 
+bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
+    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+
+  auto getRcpSrc = [=](const MachineInstr &MI) {
+    MachineInstr *ResMI = nullptr;
+    if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
+        MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
+      ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());
+
+    return ResMI;
+  };
+
+  auto getSqrtSrc = [=](const MachineInstr &MI) {
+    MachineInstr *SqrtSrcMI = nullptr;
+    mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
+    return SqrtSrcMI;
+  };
+
+  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
+  // rcp(sqrt(x))
+  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
+    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
+      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+          .addUse(SqrtSrcMI->getOperand(0).getReg())
+          .setMIFlags(MI.getFlags());
+    };
+    return true;
+  }
+
+  // sqrt(rcp(x))
+  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
+    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
+      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+          .addUse(RcpSrcMI->getOperand(0).getReg())
+          .setMIFlags(MI.getFlags());
+    };
+    return true;
+  }
+
+  return false;
+}
+
 bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
   Register SrcReg = MI.getOperand(1).getReg();

diff --git a/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
index f4ddbf1..d18dab0 100644
--- a/llvm/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/llvm/lib/Target/AMDGPU/CaymanInstructions.td

@@ -48,8 +48,6 @@
 def COS_cm : COS_Common<0x8E>;
 } // End isVector = 1
 
-def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
-
 def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;
 
 def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;

diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 12224cb..a9a3421 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td

@@ -126,7 +126,6 @@
 def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
 def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
 def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
-def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
 def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;
 
 def SIN_eg : SIN_Common<0x8D>;

diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index 4487864..b3da2fd 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td

@@ -1265,7 +1265,6 @@
   defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
   def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
 
-  def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
   def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;
 
   def R600_ExportSwz : ExportSwzInst {

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2872c44..d55d8da 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td

@@ -827,10 +827,6 @@
 
 let OtherPredicates = [UnsafeFPMath] in {
 
-//defm : RsqPat<V_RSQ_F32_e32, f32>;
-
-def : RsqPat<V_RSQ_F32_e32, f32>;
-
 // Convert (x - floor(x)) to fract(x)
 def : GCNPat <
   (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
new file mode 100644
index 0000000..cf1747f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll

@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+define amdgpu_cs float @div_sqrt(float inreg %arg1) {
+; GCN-LABEL: div_sqrt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = call float @llvm.sqrt.f32(float %arg1)
+  %b = fdiv afn float 1.000000e+00, %a
+  ret float %b
+}
+
+define amdgpu_cs float @sqrt_div(float inreg %arg1) {
+; GCN-LABEL: sqrt_div:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = fdiv afn float 1.000000e+00, %arg1
+  %b = call float @llvm.sqrt.f32(float %a)
+  ret float %b
+}
+
+define amdgpu_cs float @rcp_sqrt(float inreg %arg1) {
+; GCN-LABEL: rcp_sqrt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = call float @llvm.sqrt.f32(float %arg1)
+  %b = call float @llvm.amdgcn.rcp.f32(float %a)
+  ret float %b
+}
+
+define amdgpu_cs float @sqrt_rcp(float inreg %arg1) {
+; GCN-LABEL: sqrt_rcp:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = call float @llvm.amdgcn.rcp.f32(float %arg1)
+  %b = call float @llvm.sqrt.f32(float %a)
+  ret float %b
+}
+
+
+declare float @llvm.sqrt.f32(float)
+declare float @llvm.amdgcn.rcp.f32(float)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
new file mode 100644
index 0000000..f85ddba
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir

@@ -0,0 +1,42 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name:            rcp_sqrt_test
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK: $vgpr0 = COPY %3
+    ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+    ; GCN-LABEL: name: rcp_sqrt_test
+    ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN: $vgpr0 = COPY [[INT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %2:_(s32) = G_FSQRT %0:_
+    %3:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %2:_(s32)
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name:            sqrt_rcp_test
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: sqrt_rcp_test
+    ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN: $vgpr0 = COPY [[INT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %2:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0:_(s32)
+    %3:_(s32) = G_FSQRT %2:_
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
commit	ca57b80cd6767b97477fd157831a2b099b5f8f75	[log] [tgz]
author	Mateja Marjanovic <mateja.marjanovic@amd.com>	Mon Sep 20 16:05:45 2021 +0200
committer	Mateja Marjanovic <mateja.marjanovic@amd.com>	Tue Nov 30 17:17:15 2021 +0100
tree	96007a9c75217268db1cbaed46fe1be406c996d6
parent	3cc21ee6b966204b5d683e715f9e63ae9d63cbd4 [diff]