blob: d6778ec666cbeabccbf1ec32b5e77b0529e530c3 [file] [log] [blame]
//===- ExpandReductions.cpp - Expand reduction intrinsics -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements IR expansion for reduction intrinsics, allowing targets
// to enable the intrinsics until just before codegen.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ExpandReductions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
namespace {
bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
bool Changed = false;
SmallVector<IntrinsicInst *, 4> Worklist;
for (auto &I : instructions(F)) {
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::vector_reduce_fadd:
case Intrinsic::vector_reduce_fmul:
case Intrinsic::vector_reduce_add:
case Intrinsic::vector_reduce_mul:
case Intrinsic::vector_reduce_and:
case Intrinsic::vector_reduce_or:
case Intrinsic::vector_reduce_xor:
case Intrinsic::vector_reduce_smax:
case Intrinsic::vector_reduce_smin:
case Intrinsic::vector_reduce_umax:
case Intrinsic::vector_reduce_umin:
case Intrinsic::vector_reduce_fmax:
case Intrinsic::vector_reduce_fmin:
if (TTI->shouldExpandReduction(II))
Worklist.push_back(II);
break;
}
}
}
for (auto *II : Worklist) {
FastMathFlags FMF =
isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
Intrinsic::ID ID = II->getIntrinsicID();
RecurKind RK = getMinMaxReductionRecurKind(ID);
TargetTransformInfo::ReductionShuffle RS =
TTI->getPreferredExpandedReductionShuffle(II);
Value *Rdx = nullptr;
IRBuilder<> Builder(II);
IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
Builder.setFastMathFlags(FMF);
switch (ID) {
default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::vector_reduce_fadd:
case Intrinsic::vector_reduce_fmul: {
// FMFs must be attached to the call, otherwise it's an ordered reduction
// and it can't be handled by generating a shuffle sequence.
Value *Acc = II->getArgOperand(0);
Value *Vec = II->getArgOperand(1);
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
if (!FMF.allowReassoc())
Rdx = getOrderedReduction(Builder, Acc, Vec, RdxOpcode, RK);
else {
if (!isPowerOf2_32(
cast<FixedVectorType>(Vec->getType())->getNumElements()))
continue;
Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
Rdx = Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, Acc, Rdx,
"bin.rdx");
}
break;
}
case Intrinsic::vector_reduce_and:
case Intrinsic::vector_reduce_or: {
// Canonicalize logical or/and reductions:
// Or reduction for i1 is represented as:
// %val = bitcast <ReduxWidth x i1> to iReduxWidth
// %res = cmp ne iReduxWidth %val, 0
// And reduction for i1 is represented as:
// %val = bitcast <ReduxWidth x i1> to iReduxWidth
// %res = cmp eq iReduxWidth %val, 11111
Value *Vec = II->getArgOperand(0);
auto *FTy = cast<FixedVectorType>(Vec->getType());
unsigned NumElts = FTy->getNumElements();
if (!isPowerOf2_32(NumElts))
continue;
if (FTy->getElementType() == Builder.getInt1Ty()) {
Rdx = Builder.CreateBitCast(Vec, Builder.getIntNTy(NumElts));
if (ID == Intrinsic::vector_reduce_and) {
Rdx = Builder.CreateICmpEQ(
Rdx, ConstantInt::getAllOnesValue(Rdx->getType()));
} else {
assert(ID == Intrinsic::vector_reduce_or && "Expected or reduction.");
Rdx = Builder.CreateIsNotNull(Rdx);
}
break;
}
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
break;
}
case Intrinsic::vector_reduce_add:
case Intrinsic::vector_reduce_mul:
case Intrinsic::vector_reduce_xor:
case Intrinsic::vector_reduce_smax:
case Intrinsic::vector_reduce_smin:
case Intrinsic::vector_reduce_umax:
case Intrinsic::vector_reduce_umin: {
Value *Vec = II->getArgOperand(0);
if (!isPowerOf2_32(
cast<FixedVectorType>(Vec->getType())->getNumElements()))
continue;
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
break;
}
case Intrinsic::vector_reduce_fmax:
case Intrinsic::vector_reduce_fmin: {
// We require "nnan" to use a shuffle reduction; "nsz" is implied by the
// semantics of the reduction.
Value *Vec = II->getArgOperand(0);
if (!isPowerOf2_32(
cast<FixedVectorType>(Vec->getType())->getNumElements()) ||
!FMF.noNaNs())
continue;
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
break;
}
}
II->replaceAllUsesWith(Rdx);
II->eraseFromParent();
Changed = true;
}
return Changed;
}
class ExpandReductions : public FunctionPass {
public:
static char ID;
ExpandReductions() : FunctionPass(ID) {
initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
const auto *TTI =&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
return expandReductions(F, TTI);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.setPreservesCFG();
}
};
}
char ExpandReductions::ID;
INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
"Expand reduction intrinsics", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
"Expand reduction intrinsics", false, false)
FunctionPass *llvm::createExpandReductionsPass() {
return new ExpandReductions();
}
PreservedAnalyses ExpandReductionsPass::run(Function &F,
FunctionAnalysisManager &AM) {
const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
if (!expandReductions(F, &TTI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
return PA;
}