//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass simplifies certain intrinsic calls when their arguments are
/// uniform. Its transforms can leave an instruction whose operand was
/// previously recognized as statically uniform no longer recognized as such.
/// That is acceptable: the semantics of program execution do not (and must
/// not, for precisely this reason) depend on static uniformity; they depend
/// only on dynamic uniformity. Every downstream instruction that cares about
/// dynamic uniformity must be convergent, and isel will introduce a
/// v_readfirstlane for it if its operands cannot be proven statically
/// uniform.
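///
/// For example (a sketch), a call such as
///   %v = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %x)
/// already yields %x whenever %x is uniform, so it is replaced by its
/// argument; ballots of uniform conditions fold similarly.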
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"

using namespace llvm;
using namespace llvm::AMDGPU;
using namespace llvm::PatternMatch;

/// Wrapper for querying uniformity info that first consults \p Tracker, which
/// records the uniformity of instructions created by this pass (true means
/// uniform, false means divergent), before falling back to the precomputed
/// UniformityInfo.
static bool
isDivergentUseWithNew(const Use &U, const UniformityInfo &UI,
const ValueMap<const Value *, bool> &Tracker) {
Value *V = U.get();
if (auto It = Tracker.find(V); It != Tracker.end())
return !It->second; // divergent if marked false
return UI.isDivergentUse(U);
}

/// Optimizes uniform intrinsic calls if their operand can be proven uniform.
static bool optimizeUniformIntrinsic(IntrinsicInst &II,
const UniformityInfo &UI,
ValueMap<const Value *, bool> &Tracker) {
llvm::Intrinsic::ID IID = II.getIntrinsicID();
switch (IID) {
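  // permlane64, readfirstlane, and readlane produce a wave-uniform result and
  // act as the identity when their source operand is already uniform, so the
  // call folds to its argument.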
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
return false;
LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
II.replaceAllUsesWith(Src);
II.eraseFromParent();
return true;
}
case Intrinsic::amdgcn_ballot: {
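    // A ballot of a uniform condition is either zero (condition false in all
    // active lanes) or the non-zero mask of active lanes. Comparisons of the
    // result against zero therefore reduce to the i1 condition itself, e.g.:
    //   %b = call i64 @llvm.amdgcn.ballot.i64(i1 %c) ; %c uniform
    //   icmp ne i64 %b, 0  -->  %c
    //   icmp eq i64 %b, 0  -->  not %c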
Value *Src = II.getArgOperand(0);
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
return false;
LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
bool Changed = false;
for (User *U : make_early_inc_range(II.users())) {
if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
Value *Op0 = ICmp->getOperand(0);
Value *Op1 = ICmp->getOperand(1);
ICmpInst::Predicate Pred = ICmp->getPredicate();
Value *OtherOp = Op0 == &II ? Op1 : Op0;
if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
// Case: (icmp eq %ballot, 0) -> xor %ballot_arg, 1
Instruction *NotOp =
BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
Tracker[NotOp] = true; // NOT preserves uniformity
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
          ICmp->replaceAllUsesWith(NotOp);
          ICmp->eraseFromParent();
          Changed = true;
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
// Case: (icmp ne %ballot, 0) -> %ballot_arg
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
<< *Src << '\n');
          ICmp->replaceAllUsesWith(Src);
          ICmp->eraseFromParent();
          Changed = true;
}
}
}
// Erase the intrinsic if it has no remaining uses.
if (II.use_empty())
II.eraseFromParent();
return Changed;
}
default:
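    // runUniformIntrinsicCombine filters the intrinsic IDs before calling
    // this function, so no other intrinsic can reach this point.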
llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
}
return false;
}

/// Iterates over the intrinsic calls in \p F and optimizes those it
/// recognizes.
static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
bool IsChanged = false;
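  // Tracks the uniformity of instructions created by this pass, since the
  // precomputed UniformityInfo is not updated as the IR is rewritten.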
ValueMap<const Value *, bool> Tracker;
for (Instruction &I : make_early_inc_range(instructions(F))) {
auto *II = dyn_cast<IntrinsicInst>(&I);
if (!II)
continue;
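    // Visit only the cross-lane intrinsics that optimizeUniformIntrinsic
    // handles.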
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_ballot:
break;
default:
continue;
}
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
}
return IsChanged;
}

PreservedAnalyses
AMDGPUUniformIntrinsicCombinePass::run(Function &F,
FunctionAnalysisManager &AM) {
const auto &UI = AM.getResult<UniformityInfoAnalysis>(F);
if (!runUniformIntrinsicCombine(F, UI))
return PreservedAnalyses::all();
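  // Calls were only replaced by their already-uniform arguments (or a NOT of
  // them, which is equally uniform), so the recorded uniformity of the
  // surviving IR still holds.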
PreservedAnalyses PA;
PA.preserve<UniformityInfoAnalysis>();
return PA;
}

namespace {
class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
public:
static char ID;
AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {
initializeAMDGPUUniformIntrinsicCombineLegacyPass(
*PassRegistry::getPassRegistry());
}

private:
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
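    // The pass only rewrites values; it never changes control flow.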
AU.setPreservesCFG();
AU.addRequired<UniformityInfoWrapperPass>();
AU.addRequired<TargetPassConfig>();
}
};

} // namespace

char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
AMDGPUUniformIntrinsicCombineLegacy::ID;

bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
const UniformityInfo &UI =
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
return runUniformIntrinsicCombine(F, UI);
}

INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU Uniform Intrinsic Combine", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
"AMDGPU Uniform Intrinsic Combine", false, false)

FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
return new AMDGPUUniformIntrinsicCombineLegacy();
}