blob: 4ce5432d67fcf2d6aa5a24c027b35ea832195c40 [file] [edit]
//===----- RISCVCodeGenPrepare.cpp ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a RISC-V specific version of CodeGenPrepare.
// It munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
// basic-block-at-a-time approach.
//
//===----------------------------------------------------------------------===//
#include "RISCV.h"
#include "RISCVTargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "riscv-codegenprepare"
#define PASS_NAME "RISC-V CodeGenPrepare"
namespace {
/// Worker object that applies the RISC-V specific IR rewrites to one function.
///
/// Instructions are dispatched through InstVisitor. Every visit hook and
/// helper returns a bool that run() ORs together into its "made a change"
/// result.
class RISCVCodeGenPrepare : public InstVisitor<RISCVCodeGenPrepare, bool> {
  Function &F;              // Function being rewritten.
  const DataLayout *DL;     // Data layout of F.
  const DominatorTree *DT;  // Dominator tree supplied by the caller.
  const RISCVSubtarget *ST; // Subtarget supplied by the caller.

public:
  RISCVCodeGenPrepare(Function &Fn, const DominatorTree *DomTree,
                      const RISCVSubtarget *Subtarget)
      : F(Fn), DL(&Fn.getDataLayout()), DT(DomTree), ST(Subtarget) {}

  /// Visit every instruction of F; returns true if any visit reported a
  /// change.
  bool run();

  // InstVisitor hooks.
  /// Fallback for instructions without a dedicated hook: nothing to do.
  bool visitInstruction(Instruction &I) { return false; }
  bool visitAnd(BinaryOperator &BO);
  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitFreezeInst(FreezeInst &I);

  // Individual transforms invoked from the hooks above.
  bool expandVPStrideLoad(IntrinsicInst &I);
  bool expandMulReduction(IntrinsicInst &I);
  bool widenVPMerge(Instruction *I);
};
} // namespace
namespace {
class RISCVCodeGenPrepareLegacyPass : public FunctionPass {
public:
static char ID;
RISCVCodeGenPrepareLegacyPass() : FunctionPass(ID) {}
bool runOnFunction(Function &F) override;
StringRef getPassName() const override { return PASS_NAME; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
}
};
} // namespace
// Try to optimize (i64 (and (zext nneg (i32 X)), C1)) where C1 has bit 31 set
// but bits 63:32 are zero. Because the zext is nneg, bit 31 of X is known to
// be 0 and so are bits 63:32 of the extended value, which means the upper 32
// bits of C1 can be filled with ones without changing the result. Doing so can
// turn C1 into a simm12 that fits ANDI.
//
// Returns true if the constant operand of BO was replaced.
bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
  using namespace PatternMatch;

  // Only an i64 'and' on a 64-bit subtarget is of interest.
  if (!ST->is64Bit() || !BO.getType()->isIntegerTy(64))
    return false;

  // Operand 0 must be a non-negative zero extension whose source is an i32.
  Value *Narrow;
  if (!match(BO.getOperand(0), m_NNegZExt(m_Value(Narrow))) ||
      !Narrow->getType()->isIntegerTy(32))
    return false;

  // Operand 1 must be a constant integer.
  Value *MaskOp = BO.getOperand(1);
  auto *MaskC = dyn_cast<ConstantInt>(MaskOp);
  if (!MaskC)
    return false;

  const uint64_t Imm = MaskC->getZExtValue();
  const int64_t SExtImm = SignExtend64<32>(Imm);

  // Look for constants that fit in 32 bits but are not a simm12, and that
  // become a simm12 once bit 31 is sign extended.
  // TODO: Is it worth handling constants that only become simm32?
  if (!isUInt<32>(Imm) || isInt<12>(Imm) || !isInt<12>(SExtImm))
    return false;

  // Install the sign-extended constant as the new 'and' operand.
  BO.setOperand(
      1, ConstantInt::get(MaskOp->getType(), static_cast<uint64_t>(SExtImm)));
  return true;
}
// With EVL tail folding, an AnyOf reduction will generate an i1 vp.merge like
// follows:
//
// loop:
//   %phi = phi <vscale x 4 x i1> [zeroinitializer, %entry], [%freeze, %loop]
//   %cmp = icmp ...
//   %rec = call <vscale x 4 x i1> @llvm.vp.merge(%cmp, i1 true, %phi, %evl)
//   %freeze = freeze <vscale x 4 x i1> %rec    [optional]
//   ...
// middle:
//   %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %freeze)
//
// However RVV doesn't have any tail undisturbed mask instructions and so we
// need a convoluted sequence of mask instructions to lower the i1 vp.merge: see
// llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll.
//
// To avoid that this widens the i1 vp.merge to an i8 vp.merge, which will
// generate a single vmerge.vim:
//
// loop:
//   %phi = phi <vscale x 4 x i8> [zeroinitializer, %entry], [%freeze, %loop]
//   %cmp = icmp ...
//   %rec = call <vscale x 4 x i8> @llvm.vp.merge(%cmp, i8 1, %phi, %evl)
//   %freeze = freeze <vscale x 4 x i8> %rec
//   %trunc = trunc <vscale x 4 x i8> %freeze to <vscale x 4 x i1>
//   ...
// middle:
//   %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %trunc)
//
// The trunc will normally be sunk outside of the loop, but even if there are
// users inside the loop it is still profitable.
//
// Root is either the vp.merge itself or a freeze of it, and must be the value
// fed back into the phi. Returns true if the chain was widened, in which case
// Root (and whatever became dead with it) has been deleted.
bool RISCVCodeGenPrepare::widenVPMerge(Instruction *Root) {
  if (!Root->getType()->getScalarType()->isIntegerTy(1))
    return false;

  Value *Mask, *True, *PhiV, *EVL;
  using namespace PatternMatch;
  auto m_VPMerge = m_Intrinsic<Intrinsic::vp_merge>(
      m_Value(Mask), m_Value(True), m_Value(PhiV), m_Value(EVL));
  // When Root is a freeze, only the users of the freeze are redirected to the
  // widened chain below. The vp.merge underneath must therefore have the
  // freeze as its sole user: any other user would keep the old vp.merge alive
  // and be left reading the old phi after its recurrence has been cut.
  if (!match(Root, m_CombineOr(m_VPMerge, m_Freeze(m_OneUse(m_VPMerge)))))
    return false;

  // The merged-into value has to be a two-input phi that starts at zero, is
  // only used by the vp.merge, and receives Root on its second edge.
  auto *Phi = dyn_cast<PHINode>(PhiV);
  if (!Phi || !Phi->hasOneUse() || Phi->getNumIncomingValues() != 2 ||
      !match(Phi->getIncomingValue(0), m_Zero()) ||
      Phi->getIncomingValue(1) != Root)
    return false;

  Type *WideTy =
      VectorType::get(IntegerType::getInt8Ty(Root->getContext()),
                      cast<VectorType>(Root->getType())->getElementCount());

  // Build the i8 phi next to the old one, and the i8 merge (plus freeze, if
  // the original had one) right before Root.
  IRBuilder<> Builder(Phi);
  PHINode *WidePhi = Builder.CreatePHI(WideTy, 2);
  WidePhi->addIncoming(ConstantAggregateZero::get(WideTy),
                       Phi->getIncomingBlock(0));
  Builder.SetInsertPoint(Root);
  Value *WideTrue = Builder.CreateZExt(True, WideTy);
  Value *WideMerge = Builder.CreateIntrinsic(Intrinsic::vp_merge, {WideTy},
                                             {Mask, WideTrue, WidePhi, EVL});
  if (isa<FreezeInst>(Root))
    WideMerge = Builder.CreateFreeze(WideMerge);
  WidePhi->addIncoming(WideMerge, Phi->getIncomingBlock(1));
  Value *Trunc = Builder.CreateTrunc(WideMerge, Root->getType());

  Root->replaceAllUsesWith(Trunc);

  // Break the cycle and delete the old chain.
  Phi->setIncomingValue(1, Phi->getIncomingValue(0));
  llvm::RecursivelyDeleteTriviallyDeadInstructions(Root);

  return true;
}
// A freeze is only interesting when it freezes a vp.merge, in which case the
// freeze is offered to widenVPMerge as the root of the chain.
bool RISCVCodeGenPrepare::visitFreezeInst(FreezeInst &I) {
  const auto *Frozen = dyn_cast<IntrinsicInst>(I.getOperand(0));
  if (!Frozen || Frozen->getIntrinsicID() != Intrinsic::vp_merge)
    return false;
  return widenVPMerge(&I);
}
// Entry point for all intrinsic calls. The zero-strided-load expansion, the
// mul-reduction expansion and the i1 vp.merge widening are tried first, in
// that order; if none of them fires, the scalar phi rewrite described below is
// attempted.
//
// LLVM vector reduction intrinsics return a scalar result, but on RISC-V vector
// reduction instructions write the result in the first element of a vector
// register. So when a reduction in a loop uses a scalar phi, we end up with
// unnecessary scalar moves:
//
// loop:
//   vfmv.s.f v10, fa0
//   vfredosum.vs v8, v8, v10
//   vfmv.f.s fa0, v8
//
// This mainly affects ordered fadd reductions and VP reductions that have a
// scalar start value, since other types of reduction typically use element-wise
// vectorisation in the loop body. This tries to vectorize any scalar phis that
// feed into these reductions:
//
// loop:
//   %phi = phi float [ ..., %entry ], [ %acc, %loop ]
//   %acc = call float @llvm.vector.reduce.fadd.nxv2f32(float %phi,
//                                                      <vscale x 2 x float> %vec)
//
// ->
//
// loop:
//   %phi = phi <vscale x 2 x float> [ ..., %entry ], [ %acc.vec, %loop ]
//   %phi.scalar = extractelement <vscale x 2 x float> %phi, i64 0
//   %acc = call float @llvm.vector.reduce.fadd.nxv2f32(float %phi.scalar,
//                                                      <vscale x 2 x float> %vec)
//   %acc.vec = insertelement <vscale x 2 x float> poison, float %acc, i64 0
//
// Which eliminates the scalar -> vector -> scalar crossing during instruction
// selection.
//
// Returns true if any of the transforms changed the IR.
bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  if (expandVPStrideLoad(I))
    return true;

  if (expandMulReduction(I))
    return true;

  if (widenVPMerge(&I))
    return true;

  if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd &&
      !isa<VPReductionIntrinsic>(&I))
    return false;

  // The start value must be a phi that is used only by this reduction and
  // that receives the reduction result on one of its edges.
  auto *PHI = dyn_cast<PHINode>(I.getOperand(0));
  if (!PHI || !PHI->hasOneUse() ||
      !llvm::is_contained(PHI->incoming_values(), &I))
    return false;

  Type *VecTy = I.getOperand(1)->getType();
  IRBuilder<> Builder(PHI);
  auto *VecPHI = Builder.CreatePHI(VecTy, PHI->getNumIncomingValues());

  for (auto *BB : PHI->blocks()) {
    // A predecessor can be listed more than once (e.g. several switch edges
    // into this block). All entries for one block must carry the same value,
    // so reuse the insertelement already created for it instead of emitting
    // a second, distinct one.
    if (int Idx = VecPHI->getBasicBlockIndex(BB); Idx >= 0) {
      VecPHI->addIncoming(VecPHI->getIncomingValue(Idx), BB);
      continue;
    }

    // Wrap the scalar incoming value into lane 0 of a vector at the end of
    // the predecessor.
    Builder.SetInsertPoint(BB->getTerminator());
    Value *InsertElt = Builder.CreateInsertElement(
        VecTy, PHI->getIncomingValueForBlock(BB), (uint64_t)0);
    VecPHI->addIncoming(InsertElt, BB);
  }

  // Feed the reduction from lane 0 of the vector phi and drop the scalar phi.
  Builder.SetInsertPoint(&I);
  I.setOperand(0, Builder.CreateExtractElement(VecPHI, (uint64_t)0));

  PHI->eraseFromParent();

  return true;
}
// Partially expand a vector_reduce_mul wider than M1 to reduce the
// number of vsetvlis required when VLEN is exactly known, and
// reducing register pressure in all cases.
//
// Only fixed-length vectors are handled. The input is multiplied down to a
// vector of M1VF elements (the number of elements that fit in MinVLen bits)
// and a vector_reduce_mul on that narrower vector replaces the original
// call. Returns true if II was replaced and erased.
bool RISCVCodeGenPrepare::expandMulReduction(IntrinsicInst &II) {
  if (II.getIntrinsicID() != Intrinsic::vector_reduce_mul)
    return false;
  if (!ST->hasVInstructions())
    return false;

  Value *TmpVec = II.getArgOperand(0);
  auto *VecTy = dyn_cast<FixedVectorType>(TmpVec->getType());
  if (!VecTy)
    return false;

  unsigned EltSize = VecTy->getScalarSizeInBits();
  unsigned VF = VecTy->getNumElements();
  // The halving scheme below relies on M1VF evenly dividing VF, which only
  // holds when both the element width and the element count are powers of
  // two. Otherwise lanes would be dropped or poison lanes would be reduced.
  if (!isPowerOf2_32(EltSize) || !isPowerOf2_32(VF))
    return false;

  unsigned MinVLen = ST->getRealMinVLen();
  unsigned M1VF = MinVLen / EltSize;
  // M1VF is zero when a single element is wider than MinVLen; a zero-element
  // vector type cannot be formed, so give up (isPowerOf2_32(0) is false).
  if (!isPowerOf2_32(M1VF) || VF <= M1VF)
    return false;

  IRBuilder<> Builder(&II);
  auto *M1Ty = FixedVectorType::get(VecTy->getElementType(), M1VF);

  // When VLEN is exactly known, extract m1 pieces and build a mul tree.
  // This greatly reduces register pressure during the reduction, and
  // avoids all but one vsetvli (the one from original LMUL to m1).
  // TODO: Generalize to handle the splitting case.
  if (MinVLen == ST->getRealMaxVLen() && VF <= 8 * M1VF) {
    unsigned NumM1 = VF / M1VF;
    assert(isPowerOf2_32(NumM1) && NumM1 <= 8);
    SmallVector<Value *, 8> Pieces(NumM1);
    for (unsigned i = 0; i < NumM1; i++)
      Pieces[i] =
          Builder.CreateExtractVector(M1Ty, TmpVec, (uint64_t)(i * M1VF));
    // Pairwise multiply in place; slot i is written only after slots 2i and
    // 2i+1 have been read, so no input is clobbered early.
    while (Pieces.size() > 1) {
      for (unsigned i = 0; i < Pieces.size() / 2; i++)
        Pieces[i] =
            Builder.CreateMul(Pieces[i * 2], Pieces[i * 2 + 1], "bin.rdx");
      Pieces.truncate(Pieces.size() / 2);
    }
    TmpVec = Pieces[0];
  } else {
    // For non-exact VLEN, shuffle-reduce at the original vector width down to
    // m1, then extract. This prioritizes reducing the number of vsetvli
    // over maximal reduction of LMUL for the intermediate states.
    SmallVector<int, 32> ShuffleMask(VF);
    for (unsigned LiveElts = VF; LiveElts > M1VF; LiveElts /= 2) {
      unsigned Half = LiveElts / 2;
      // Move the upper half of the live lanes onto the lower half; every
      // other lane of the shuffle result is undefined (-1).
      std::iota(ShuffleMask.begin(), ShuffleMask.begin() + Half, Half);
      std::fill(ShuffleMask.begin() + Half, ShuffleMask.end(), -1);
      Value *Shuf =
          Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf");
      TmpVec = Builder.CreateMul(TmpVec, Shuf, "bin.rdx");
    }
    // Extract the M1-sized subvector and emit the final reduction intrinsic.
    // This is the reason we're here - to force a vsetvli toggle once at m1.
    TmpVec = Builder.CreateExtractVector(M1Ty, TmpVec, (uint64_t)0, "rdx.sub");
  }

  Value *Rdx =
      Builder.CreateIntrinsic(Intrinsic::vector_reduce_mul, {M1Ty}, {TmpVec});
  II.replaceAllUsesWith(Rdx);
  II.eraseFromParent();
  return true;
}
// Always expand zero strided loads so we match more .vx splat patterns, even if
// we have +optimized-zero-stride-loads. RISCVDAGToDAGISel::Select will convert
// it back to a strided load if it's optimized.
//
// Only an experimental.vp.strided.load with a zero stride, an all-ones mask and
// a VL known to be non-zero is handled. It is replaced by a scalar load of the
// element followed by a vp.merge of the splatted value into poison. Returns
// true if II was replaced and erased.
bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
  Value *BasePtr, *VL;

  using namespace PatternMatch;
  if (!match(&II, m_Intrinsic<Intrinsic::experimental_vp_strided_load>(
                      m_Value(BasePtr), m_Zero(), m_AllOnes(), m_Value(VL))))
    return false;

  // If SEW>XLEN then a splat will get lowered as a zero strided load anyway, so
  // avoid expanding here.
  if (II.getType()->getScalarSizeInBits() > ST->getXLen())
    return false;

  // The scalar load below is unconditional, so at least one lane of the
  // original load must be guaranteed to execute.
  if (!isKnownNonZero(VL, {*DL, DT, nullptr, &II}))
    return false;

  auto *VTy = cast<VectorType>(II.getType());

  IRBuilder<> Builder(&II);
  Type *STy = VTy->getElementType();
  // Carry over an explicit 'align' attribute from the pointer operand. A plain
  // CreateLoad would assume the ABI alignment of the element type, which is
  // too strong if the intrinsic was given a smaller alignment. With no
  // attribute the builder still falls back to the ABI alignment.
  Value *Val = Builder.CreateAlignedLoad(STy, BasePtr, II.getParamAlign(0));
  Value *Res = Builder.CreateIntrinsic(
      Intrinsic::vp_merge, VTy,
      {II.getOperand(2), Builder.CreateVectorSplat(VTy->getElementCount(), Val),
       PoisonValue::get(VTy), VL});

  II.replaceAllUsesWith(Res);
  II.eraseFromParent();
  return true;
}
bool RISCVCodeGenPrepare::run() {
bool MadeChange = false;
for (auto &BB : F)
for (Instruction &I : llvm::make_early_inc_range(BB))
MadeChange |= visit(I);
return MadeChange;
}
// Legacy pass manager hook: collect the subtarget and dominator tree for F and
// hand off to RISCVCodeGenPrepare. Returns true if the IR was changed.
bool RISCVCodeGenPrepareLegacyPass::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const auto &TM =
      getAnalysis<TargetPassConfig>().getTM<RISCVTargetMachine>();
  const RISCVSubtarget *Subtarget = &TM.getSubtarget<RISCVSubtarget>(F);
  const DominatorTree *DomTree =
      &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

  return RISCVCodeGenPrepare(F, DomTree, Subtarget).run();
}
// Register the legacy pass together with every analysis it requires in
// getAnalysisUsage().
INITIALIZE_PASS_BEGIN(RISCVCodeGenPrepareLegacyPass, DEBUG_TYPE, PASS_NAME,
                      false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(RISCVCodeGenPrepareLegacyPass, DEBUG_TYPE, PASS_NAME, false,
                    false)
// Storage for the pass identifier that the constructor hands to FunctionPass.
char RISCVCodeGenPrepareLegacyPass::ID = 0;
// Factory for the legacy pass; returns a newly allocated pass instance.
FunctionPass *llvm::createRISCVCodeGenPrepareLegacyPass() {
  FunctionPass *NewPass = new RISCVCodeGenPrepareLegacyPass();
  return NewPass;
}
// New pass manager entry point. Runs RISCVCodeGenPrepare on F; when nothing
// changed every analysis is preserved, otherwise only the CFG analyses are.
PreservedAnalyses RISCVCodeGenPreparePass::run(Function &F,
                                               FunctionAnalysisManager &FAM) {
  const DominatorTree &DomTree = FAM.getResult<DominatorTreeAnalysis>(F);
  const RISCVSubtarget &Subtarget = TM->getSubtarget<RISCVSubtarget>(F);

  if (!RISCVCodeGenPrepare(F, &DomTree, &Subtarget).run())
    return PreservedAnalyses::all();

  PreservedAnalyses Preserved = PreservedAnalyses::none();
  Preserved.preserveSet<CFGAnalyses>();
  return Preserved;
}