//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

AMDGPUCombinerHelper::AMDGPUCombinerHelper(
    GISelChangeObserver &Observer, MachineIRBuilder &B, bool IsPreLegalize,
    GISelValueTracking *VT, MachineDominatorTree *MDT, const LegalizerInfo *LI,
    const GCNSubtarget &STI)
    : CombinerHelper(Observer, B, IsPreLegalize, VT, MDT, LI), STI(STI),
      TII(*STI.getInstrInfo()) {}

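// Returns true if a G_FNEG of \p MI's result can be folded into \p MI itself,
// either by negating its source operands or by switching to the inverse
// min/max opcode (see matchFoldableFneg / applyFoldableFneg below).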
LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
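// The exceptions are instructions that touch memory, plain copies / bitcasts /
// phis / inline asm, and the handful of intrinsics listed below whose operands
// cannot take neg/abs modifiers.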
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

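// Returns true if every non-debug user of \p MI's result can fold a source
// modifier, and no more than \p CostThreshold of those users might have to
// switch from a 32-bit encoding to the 64-bit VOP3 encoding to do so.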
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // a source modifier is truly free for them. Users that could otherwise use
  // a 32-bit encoding are forced into the 64-bit VOP3 encoding by the
  // modifier, which increases code size. Try to avoid increasing code size
  // unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

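// Negating an FP add/sub/fma by negating its sources can produce a zero of
// the opposite sign (e.g. -(x + -x) is -0.0 but (-x) + x is +0.0), so those
// folds are only done when signed zeros may be ignored, either globally
// (TargetOptions::NoSignedZerosFPMath) or via the instruction's 'nsz' flag.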
static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

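// Matches the bit patterns of 1.0 / (2.0 * pi) in half, single and double
// precision. Subtargets with hasInv2PiInlineImm() can encode this constant as
// an inline immediate.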
static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// +0.0 and 1.0 / (2.0 * pi) have inline immediate encodings, but their negated
// forms do not, so there is an additional cost to negate them.
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}

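// -min(x, y) is max(-x, -y) and vice versa, so pushing an fneg through a
// min/max requires switching to the opposite opcode.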
static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_FMAXIMUM:
    return AMDGPU::G_FMINIMUM;
  case AMDGPU::G_FMINIMUM:
    return AMDGPU::G_FMAXIMUM;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

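// Match a G_FNEG whose source instruction can absorb the negation, e.g.
//   %a:_(s32) = G_FMUL %x, %y
//   %b:_(s32) = G_FNEG %a
// can instead negate one of the multiply's sources. \p MatchInfo is set to the
// instruction defining the fneg's source.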
bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) const {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) const {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Replace register in operand with a register holding negated value.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Replace either register in operands with a register holding negated value.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate appropriate operands so that resulting value of MatchInfo is
  // negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
    // but replaceRegWith will replace defs as well. It is easier to replace one
    // def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for the remaining uses of MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

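// Returns true if \p Reg is defined either by a G_FPEXT from an f16 value or
// by an FP constant that can be represented exactly in f16.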
// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}

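// Match the expansion of an fmed3 that was promoted from f16 to f32: \p MI is
// the G_FPTRUNC back to f16 of a single-use f32 value, and \p Src0, \p Src1,
// \p Src2 (the fmed3 sources, supplied by the combine pattern) are all either
// f16 values extended to f32 or constants exactly representable in f16. In
// that case the median can be computed directly at f16.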
bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) const {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

  return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
         isFPExtFromF16OrConst(MRI, Src2);
}

void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) const {
  // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
  // sources.
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

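  // Compute the median of three via the IEEE min/max variants:
  //   med3(a, b, c) = min(max(a, b), max(min(a, b), c))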
  LLT Ty = MRI.getType(Src0);
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}

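// Fold a multiply by a select between two power-of-two constants into ldexp,
// e.g.
//   (fmul x, (select cond, 8.0, 0.25)) -> (fldexp x, (select cond, 3, -2))
// Both constants must have the same sign; if they are negative, x is negated
// and the exponents of their absolute values are used. For f32 the fold is
// skipped when both constants are inline immediates.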
bool AMDGPUCombinerHelper::matchCombineFmulWithSelectToFldexp(
    MachineInstr &MI, MachineInstr &Sel,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  assert(MI.getOpcode() == TargetOpcode::G_FMUL);
  assert(Sel.getOpcode() == TargetOpcode::G_SELECT);
  assert(MI.getOperand(2).getReg() == Sel.getOperand(0).getReg());

  Register Dst = MI.getOperand(0).getReg();
  LLT DestTy = MRI.getType(Dst);
  LLT ScalarDestTy = DestTy.getScalarType();

  if ((ScalarDestTy != LLT::float64() && ScalarDestTy != LLT::float32() &&
       ScalarDestTy != LLT::float16()) ||
      !MRI.hasOneNonDBGUse(Sel.getOperand(0).getReg()))
    return false;

  Register SelectCondReg = Sel.getOperand(1).getReg();
  MachineInstr *SelectTrue = MRI.getVRegDef(Sel.getOperand(2).getReg());
  MachineInstr *SelectFalse = MRI.getVRegDef(Sel.getOperand(3).getReg());

  const auto SelectTrueVal =
      isConstantOrConstantSplatVectorFP(*SelectTrue, MRI);
  if (!SelectTrueVal)
    return false;
  const auto SelectFalseVal =
      isConstantOrConstantSplatVectorFP(*SelectFalse, MRI);
  if (!SelectFalseVal)
    return false;

  if (SelectTrueVal->isNegative() != SelectFalseVal->isNegative())
    return false;

  // For f32, only non-inline constants should be transformed.
  if (ScalarDestTy == LLT::float32() && TII.isInlineConstant(*SelectTrueVal) &&
      TII.isInlineConstant(*SelectFalseVal))
    return false;

  int SelectTrueLog2Val = SelectTrueVal->getExactLog2Abs();
  if (SelectTrueLog2Val == INT_MIN)
    return false;
  int SelectFalseLog2Val = SelectFalseVal->getExactLog2Abs();
  if (SelectFalseLog2Val == INT_MIN)
    return false;

  MatchInfo = [=, &MI](MachineIRBuilder &Builder) {
    LLT IntDestTy = DestTy.changeElementType(LLT::scalar(32));
    auto NewSel = Builder.buildSelect(
        IntDestTy, SelectCondReg,
        Builder.buildConstant(IntDestTy, SelectTrueLog2Val),
        Builder.buildConstant(IntDestTy, SelectFalseLog2Val));

    Register XReg = MI.getOperand(1).getReg();
    if (SelectTrueVal->isNegative()) {
      auto NegX =
          Builder.buildFNeg(DestTy, XReg, MRI.getVRegDef(XReg)->getFlags());
      Builder.buildFLdexp(Dst, NegX, NewSel, MI.getFlags());
    } else {
      Builder.buildFLdexp(Dst, XReg, NewSel, MI.getFlags());
    }
  };

  return true;
}