blob: ba44b4a702a2cc5705c6f8656c0a2b54f35461af [file] [log] [blame]
//===- LoopPipelining.cpp - Code to perform loop software pipelining-------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements loop software pipelining
//
//===----------------------------------------------------------------------===//
#include "PassDetail.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/MathExtras.h"
using namespace mlir;
using namespace mlir::scf;
namespace {
/// Helper to keep internal information during pipelining transformation.
struct LoopPipelinerInternal {
/// Coarse liverange information for ops used across stages.
struct LiverangeInfo {
unsigned lastUseStage = 0;
unsigned defStage = 0;
};
protected:
ForOp forOp;
unsigned maxStage = 0;
DenseMap<Operation *, unsigned> stages;
std::vector<Operation *> opOrder;
int64_t ub;
int64_t lb;
int64_t step;
// When peeling the kernel we generate several version of each value for
// different stage of the prologue. This map tracks the mapping between
// original Values in the loop and the different versions
// peeled from the loop.
DenseMap<Value, llvm::SmallVector<Value>> valueMapping;
/// Assign a value to `valueMapping`, this means `val` represents the version
/// `idx` of `key` in the epilogue.
void setValueMapping(Value key, Value el, int64_t idx);
public:
/// Initalize the information for the given `op`, return true if it
/// satisfies the pre-condition to apply pipelining.
bool initializeLoopInfo(ForOp op, const PipeliningOption &options);
/// Emits the prologue, this creates `maxStage - 1` part which will contain
/// operations from stages [0; i], where i is the part index.
void emitPrologue(PatternRewriter &rewriter);
/// Gather liverange information for Values that are used in a different stage
/// than its definition.
llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues();
scf::ForOp createKernelLoop(
const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
PatternRewriter &rewriter,
llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap);
/// Emits the pipelined kernel. This clones loop operations following user
/// order and remaps operands defined in a different stage as their use.
void createKernel(
scf::ForOp newForOp,
const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
PatternRewriter &rewriter);
/// Emits the epilogue, this creates `maxStage - 1` part which will contain
/// operations from stages [i; maxStage], where i is the part index.
llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter);
};
bool LoopPipelinerInternal::initializeLoopInfo(
ForOp op, const PipeliningOption &options) {
forOp = op;
auto upperBoundCst =
forOp.upperBound().getDefiningOp<arith::ConstantIndexOp>();
auto lowerBoundCst =
forOp.lowerBound().getDefiningOp<arith::ConstantIndexOp>();
auto stepCst = forOp.step().getDefiningOp<arith::ConstantIndexOp>();
if (!upperBoundCst || !lowerBoundCst || !stepCst)
return false;
ub = upperBoundCst.value();
lb = lowerBoundCst.value();
step = stepCst.value();
int64_t numIteration = ceilDiv(ub - lb, step);
std::vector<std::pair<Operation *, unsigned>> schedule;
options.getScheduleFn(forOp, schedule);
if (schedule.empty())
return false;
opOrder.reserve(schedule.size());
for (auto &opSchedule : schedule) {
maxStage = std::max(maxStage, opSchedule.second);
stages[opSchedule.first] = opSchedule.second;
opOrder.push_back(opSchedule.first);
}
if (numIteration <= maxStage)
return false;
// All operations need to have a stage.
if (forOp
.walk([this](Operation *op) {
if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) &&
stages.find(op) == stages.end())
return WalkResult::interrupt();
return WalkResult::advance();
})
.wasInterrupted())
return false;
// Only support loop carried dependency with a distance of 1. This means the
// source of all the scf.yield operands needs to be defined by operations in
// the loop.
if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(),
[this](Value operand) {
Operation *def = operand.getDefiningOp();
return !def || stages.find(def) == stages.end();
}))
return false;
return true;
}
void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) {
// Initialize the iteration argument to the loop initiale values.
for (BlockArgument &arg : forOp.getRegionIterArgs()) {
OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg);
setValueMapping(arg, operand.get(), 0);
}
auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
for (int64_t i = 0; i < maxStage; i++) {
// special handling for induction variable as the increment is implicit.
Value iv = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), lb + i);
setValueMapping(forOp.getInductionVar(), iv, i);
for (Operation *op : opOrder) {
if (stages[op] > i)
continue;
Operation *newOp = rewriter.clone(*op);
for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
auto it = valueMapping.find(op->getOperand(opIdx));
if (it != valueMapping.end())
newOp->setOperand(opIdx, it->second[i - stages[op]]);
}
for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
setValueMapping(op->getResult(destId), newOp->getResult(destId),
i - stages[op]);
// If the value is a loop carried dependency update the loop argument
// mapping.
for (OpOperand &operand : yield->getOpOperands()) {
if (operand.get() != op->getResult(destId))
continue;
setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
newOp->getResult(destId), i - stages[op] + 1);
}
}
}
}
}
llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
LoopPipelinerInternal::analyzeCrossStageValues() {
llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues;
for (Operation *op : opOrder) {
unsigned stage = stages[op];
for (OpOperand &operand : op->getOpOperands()) {
Operation *def = operand.get().getDefiningOp();
if (!def)
continue;
auto defStage = stages.find(def);
if (defStage == stages.end() || defStage->second == stage)
continue;
assert(stage > defStage->second);
LiverangeInfo &info = crossStageValues[operand.get()];
info.defStage = defStage->second;
info.lastUseStage = std::max(info.lastUseStage, stage);
}
}
return crossStageValues;
}
scf::ForOp LoopPipelinerInternal::createKernelLoop(
const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
&crossStageValues,
PatternRewriter &rewriter,
llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) {
// Creates the list of initial values associated to values used across
// stages. The initial values come from the prologue created above.
// Keep track of the kernel argument associated to each version of the
// values passed to the kernel.
llvm::SmallVector<Value> newLoopArg;
// For existing loop argument initialize them with the right version from the
// prologue.
for (auto retVal :
llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
Operation *def = retVal.value().getDefiningOp();
assert(def && "Only support loop carried dependencies of distance 1");
unsigned defStage = stages[def];
Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]]
[maxStage - defStage];
assert(valueVersion);
newLoopArg.push_back(valueVersion);
}
for (auto escape : crossStageValues) {
LiverangeInfo &info = escape.second;
Value value = escape.first;
for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage;
stageIdx++) {
Value valueVersion =
valueMapping[value][maxStage - info.lastUseStage + stageIdx];
assert(valueVersion);
newLoopArg.push_back(valueVersion);
loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage -
stageIdx)] = newLoopArg.size() - 1;
}
}
// Create the new kernel loop. Since we need to peel `numStages - 1`
// iteration we change the upper bound to remove those iterations.
Value newUb = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(),
ub - maxStage * step);
auto newForOp = rewriter.create<scf::ForOp>(
forOp.getLoc(), forOp.lowerBound(), newUb, forOp.step(), newLoopArg);
return newForOp;
}
void LoopPipelinerInternal::createKernel(
scf::ForOp newForOp,
const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
&crossStageValues,
const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
PatternRewriter &rewriter) {
valueMapping.clear();
// Create the kernel, we clone instruction based on the order given by
// user and remap operands coming from a previous stages.
rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin());
BlockAndValueMapping mapping;
mapping.map(forOp.getInductionVar(), newForOp.getInductionVar());
for (auto arg : llvm::enumerate(forOp.getRegionIterArgs())) {
mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);
}
for (Operation *op : opOrder) {
int64_t useStage = stages[op];
auto *newOp = rewriter.clone(*op, mapping);
for (OpOperand &operand : op->getOpOperands()) {
// Special case for the induction variable uses. We replace it with a
// version incremented based on the stage where it is used.
if (operand.get() == forOp.getInductionVar()) {
rewriter.setInsertionPoint(newOp);
Value offset = rewriter.create<arith::ConstantIndexOp>(
forOp.getLoc(), (maxStage - stages[op]) * step);
Value iv = rewriter.create<arith::AddIOp>(
forOp.getLoc(), newForOp.getInductionVar(), offset);
newOp->setOperand(operand.getOperandNumber(), iv);
rewriter.setInsertionPointAfter(newOp);
continue;
}
auto arg = operand.get().dyn_cast<BlockArgument>();
if (arg && arg.getOwner() == forOp.getBody()) {
// If the value is a loop carried value coming from stage N + 1 remap,
// it will become a direct use.
Value ret = forOp.getBody()->getTerminator()->getOperand(
arg.getArgNumber() - 1);
Operation *dep = ret.getDefiningOp();
if (!dep)
continue;
auto stageDep = stages.find(dep);
if (stageDep == stages.end() || stageDep->second == useStage)
continue;
assert(stageDep->second == useStage + 1);
newOp->setOperand(operand.getOperandNumber(),
mapping.lookupOrDefault(ret));
continue;
}
// For operands defined in a previous stage we need to remap it to use
// the correct region argument. We look for the right version of the
// Value based on the stage where it is used.
Operation *def = operand.get().getDefiningOp();
if (!def)
continue;
auto stageDef = stages.find(def);
if (stageDef == stages.end() || stageDef->second == useStage)
continue;
auto remap = loopArgMap.find(
std::make_pair(operand.get(), useStage - stageDef->second));
assert(remap != loopArgMap.end());
newOp->setOperand(operand.getOperandNumber(),
newForOp.getRegionIterArgs()[remap->second]);
}
}
// Collect the Values that need to be returned by the forOp. For each
// value we need to have `LastUseStage - DefStage` number of versions
// returned.
// We create a mapping between original values and the associated loop
// returned values that will be needed by the epilogue.
llvm::SmallVector<Value> yieldOperands;
for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) {
yieldOperands.push_back(mapping.lookupOrDefault(retVal));
}
for (auto &it : crossStageValues) {
int64_t version = maxStage - it.second.lastUseStage + 1;
unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage;
// add the original verstion to yield ops.
// If there is a liverange spanning across more than 2 stages we need to add
// extra arg.
for (unsigned i = 1; i < numVersionReturned; i++) {
setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
version++);
yieldOperands.push_back(
newForOp.getBody()->getArguments()[yieldOperands.size() + 1 +
newForOp.getNumInductionVars()]);
}
setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
version++);
yieldOperands.push_back(mapping.lookupOrDefault(it.first));
}
// Map the yield operand to the forOp returned value.
for (auto retVal :
llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
Operation *def = retVal.value().getDefiningOp();
assert(def && "Only support loop carried dependencies of distance 1");
unsigned defStage = stages[def];
setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
newForOp->getResult(retVal.index()),
maxStage - defStage + 1);
}
rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands);
}
llvm::SmallVector<Value>
LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) {
llvm::SmallVector<Value> returnValues(forOp->getNumResults());
// Emit different versions of the induction variable. They will be
// removed by dead code if not used.
for (int64_t i = 0; i < maxStage; i++) {
Value newlastIter = rewriter.create<arith::ConstantIndexOp>(
forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i));
setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i);
}
// Emit `maxStage - 1` epilogue part that includes operations fro stages
// [i; maxStage].
for (int64_t i = 1; i <= maxStage; i++) {
for (Operation *op : opOrder) {
if (stages[op] < i)
continue;
Operation *newOp = rewriter.clone(*op);
for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
auto it = valueMapping.find(op->getOperand(opIdx));
if (it != valueMapping.end()) {
Value v = it->second[maxStage - stages[op] + i];
assert(v);
newOp->setOperand(opIdx, v);
}
}
for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
setValueMapping(op->getResult(destId), newOp->getResult(destId),
maxStage - stages[op] + i);
// If the value is a loop carried dependency update the loop argument
// mapping and keep track of the last version to replace the original
// forOp uses.
for (OpOperand &operand :
forOp.getBody()->getTerminator()->getOpOperands()) {
if (operand.get() != op->getResult(destId))
continue;
unsigned version = maxStage - stages[op] + i + 1;
// If the version is greater than maxStage it means it maps to the
// original forOp returned value.
if (version > maxStage) {
returnValues[operand.getOperandNumber()] = newOp->getResult(destId);
continue;
}
setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
newOp->getResult(destId), version);
}
}
}
}
return returnValues;
}
void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) {
auto it = valueMapping.find(key);
// If the value is not in the map yet add a vector big enough to store all
// versions.
if (it == valueMapping.end())
it =
valueMapping
.insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1)))
.first;
it->second[idx] = el;
}
/// Generate a pipelined version of the scf.for loop based on the schedule given
/// as option. This applies the mechanical transformation of changing the loop
/// and generating the prologue/epilogue for the pipelining and doesn't make any
/// decision regarding the schedule.
/// Based on the option the loop is split into several stages.
/// The transformation assumes that the scheduling given by user is valid.
/// For example if we break a loop into 3 stages named S0, S1, S2 we would
/// generate the following code with the number in parenthesis the iteration
/// index:
/// S0(0) // Prologue
/// S0(1) S1(0) // Prologue
/// scf.for %I = %C0 to %N - 2 {
/// S0(I+2) S1(I+1) S2(I) // Pipelined kernel
/// }
/// S1(N) S2(N-1) // Epilogue
/// S2(N) // Epilogue
struct ForLoopPipelining : public OpRewritePattern<ForOp> {
ForLoopPipelining(const PipeliningOption &options, MLIRContext *context)
: OpRewritePattern<ForOp>(context), options(options) {}
LogicalResult matchAndRewrite(ForOp forOp,
PatternRewriter &rewriter) const override {
LoopPipelinerInternal pipeliner;
if (!pipeliner.initializeLoopInfo(forOp, options))
return failure();
// 1. Emit prologue.
pipeliner.emitPrologue(rewriter);
// 2. Track values used across stages. When a value cross stages it will
// need to be passed as loop iteration arguments.
// We first collect the values that are used in a different stage than where
// they are defined.
llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
crossStageValues = pipeliner.analyzeCrossStageValues();
// Mapping between original loop values used cross stage and the block
// arguments associated after pipelining. A Value may map to several
// arguments if its liverange spans across more than 2 stages.
llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap;
// 3. Create the new kernel loop and return the block arguments mapping.
ForOp newForOp =
pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap);
// Create the kernel block, order ops based on user choice and remap
// operands.
pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter);
// 4. Emit the epilogue after the new forOp.
rewriter.setInsertionPointAfter(newForOp);
llvm::SmallVector<Value> returnValues = pipeliner.emitEpilogue(rewriter);
// 5. Erase the original loop and replace the uses with the epilogue output.
if (forOp->getNumResults() > 0)
rewriter.replaceOp(forOp, returnValues);
else
rewriter.eraseOp(forOp);
return success();
}
protected:
PipeliningOption options;
};
} // namespace
void mlir::scf::populateSCFLoopPipeliningPatterns(
RewritePatternSet &patterns, const PipeliningOption &options) {
patterns.add<ForLoopPipelining>(options, patterns.getContext());
}