//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
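  ///
  /// For example (an illustrative sketch, not taken from an actual test), a
  /// uniform 16 bit addition
  ///   %r = add i16 %a, %b
  /// is rewritten as
  ///   %ext0 = zext i16 %a to i32
  ///   %ext1 = zext i16 %b to i32
  ///   %res = add nuw nsw i32 %ext0, %ext1
  ///   %r = trunc i32 %res to i16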
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with the equivalent 32 bit 'icmp'
  /// operation.
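  ///
  /// For example (an illustrative sketch), a uniform signed comparison
  ///   %c = icmp slt i16 %a, %b
  /// is rewritten as
  ///   %ext0 = sext i16 %a to i32
  ///   %ext1 = sext i16 %b to i32
  ///   %c = icmp slt i32 %ext0, %ext1
  /// The i1 result is unchanged, so no truncation is needed.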
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit 'select' operation,
  /// and truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
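  ///
  /// For example (an illustrative sketch), with a condition that is not a
  /// signed 'icmp'
  ///   %r = select i1 %c, i16 %a, i16 %b
  /// is rewritten as
  ///   %ext1 = zext i16 %a to i32
  ///   %ext2 = zext i16 %b to i32
  ///   %res = select i1 %c, i32 %ext1, i32 %ext2
  ///   %r = trunc i32 %res to i16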
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit
  /// 'bitreverse' intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
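  ///
  /// For example (an illustrative sketch), a uniform
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %a)
  /// is rewritten as
  ///   %ext = zext i16 %a to i32
  ///   %rev = call i32 @llvm.bitreverse.i32(i32 %ext)
  ///   %shr = lshr i32 %rev, 16
  ///   %r = trunc i32 %shr to i16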
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
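// The promoted operands are extended from at most 16 bits, so the 32 bit
// result of a shl, add or sub stays well inside the i32 signed range. A mul
// of two 16 bit values can exceed it (e.g. 0xFFFF * 0xFFFF = 0xFFFE0001), so
// nsw is only safe when the original operation had nuw, i.e. the product
// already fits in the original type.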
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
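// Zero extended 16 bit operands keep the result of a shl, add or mul below
// 2^32 (even the worst case, 0xFFFF * 0xFFFF, still fits), so nuw always
// holds for them. A sub can borrow (e.g. 0 - 1), so it only keeps nuw when
// the original operation had it.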
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(I))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(I))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes = Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
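//
// For example (an illustrative sketch), with !fpmath allowing 2.5 ulp error
//   %d = fdiv float %x, %y, !fpmath !0   ; !0 = !{float 2.500000e+00}
// becomes, when f32 denormals are off and no unsafe math flags are present,
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)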
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (ST->hasFP32Denormals() || UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
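      // Visiting an instruction may erase it and insert replacements, so
      // remember the iterator to the next instruction before visiting.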
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}