//===- DistributionUtils.cpp - Distribution tools for GPUOps --------------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utility methods for GPU warp distribution patterns.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/DenseMap.h"
#include <numeric>
using namespace mlir;
using namespace mlir::gpu;
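
/// Create a clone of `warpOp` with `newReturnTypes` as result types, move the
/// original body into the clone, and rewrite its terminating gpu.yield to
/// return `newYieldedValues`. The original op is left behind with an empty
/// body; callers are expected to replace or erase it.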
WarpExecuteOnLane0Op
WarpDistributionPattern::moveRegionToNewWarpOpAndReplaceReturns(
RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
ValueRange newYieldedValues, TypeRange newReturnTypes) const {
  // Create a new op just before the existing one, with the new result types.
OpBuilder::InsertionGuard g(rewriter);
rewriter.setInsertionPoint(warpOp);
auto newWarpOp = WarpExecuteOnLane0Op::create(
rewriter, warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(),
warpOp.getWarpSize(), warpOp.getArgs(),
warpOp.getBody()->getArgumentTypes());
Region &opBody = warpOp.getBodyRegion();
Region &newOpBody = newWarpOp.getBodyRegion();
Block &newOpFirstBlock = newOpBody.front();
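  // Splice the original body in front of the block the builder just created,
  // then drop that (now trailing) placeholder block.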
rewriter.inlineRegionBefore(opBody, newOpBody, newOpBody.begin());
rewriter.eraseBlock(&newOpFirstBlock);
assert(newWarpOp.getWarpRegion().hasOneBlock() &&
"expected WarpOp with single block");
auto yield =
cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
rewriter.modifyOpInPlace(
yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
return newWarpOp;
}
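
/// Create a new WarpExecuteOnLane0Op that yields `newYieldedValues` in
/// addition to everything `warpOp` already yields, and replace `warpOp` with
/// it. Values that are already yielded are not duplicated; for each entry of
/// `newYieldedValues`, `indices` receives the result index of the new op that
/// carries it.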
WarpExecuteOnLane0Op
WarpDistributionPattern::moveRegionToNewWarpOpAndAppendReturns(
RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
ValueRange newYieldedValues, TypeRange newReturnTypes,
SmallVector<size_t> &indices) const {
SmallVector<Type> types(warpOp.getResultTypes().begin(),
warpOp.getResultTypes().end());
gpu::YieldOp yield = warpOp.getTerminator();
SmallVector<Value> yieldValues(yield.getOperands().begin(),
yield.getOperands().end());
llvm::SmallDenseMap<Value, unsigned> indexLookup;
// Record the value -> first index mapping for faster lookup.
  for (auto [i, v] : llvm::enumerate(yieldValues))
    indexLookup.try_emplace(v, i);
for (auto [value, type] : llvm::zip_equal(newYieldedValues, newReturnTypes)) {
// If the value already exists in the yield, don't create a new output.
if (indexLookup.count(value)) {
indices.push_back(indexLookup[value]);
} else {
// If the value is new, add it to the yield and to the types.
yieldValues.push_back(value);
types.push_back(type);
indices.push_back(yieldValues.size() - 1);
}
}
WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
rewriter, warpOp, yieldValues, types);
rewriter.replaceOp(warpOp,
newWarpOp.getResults().take_front(warpOp.getNumResults()));
return newWarpOp;
}
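
/// Return the gpu.yield operand of `warpOp` whose defining op satisfies `fn`
/// and whose corresponding warp op result still has uses, or nullptr if there
/// is no such operand.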
OpOperand *WarpDistributionPattern::getWarpResult(
WarpExecuteOnLane0Op warpOp,
llvm::function_ref<bool(Operation *)> fn) const {
gpu::YieldOp yield = warpOp.getTerminator();
for (OpOperand &yieldOperand : yield->getOpOperands()) {
    Value yieldValue = yieldOperand.get();
    Operation *definingOp = yieldValue.getDefiningOp();
    if (definingOp && fn(definingOp)) {
if (!warpOp.getResult(yieldOperand.getOperandNumber()).use_empty())
return &yieldOperand;
}
}
return nullptr;
}
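
/// Delinearize `laneId` into per-dimension lane IDs, where the number of
/// lanes along dimension i is originalShape[i] / distributedShape[i]. Returns
/// false if a dimension does not divide evenly or the lane counts do not
/// multiply up to `warpSize`. If the two shapes are identical, nothing is
/// distributed and the function returns true with an empty vector. As an
/// illustrative example, originalShape = [4, 32], distributedShape = [4, 1],
/// and warpSize = 32 give delinearizedIds = [constant 0, laneId].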
bool WarpDistributionPattern::delinearizeLaneId(
OpBuilder &builder, Location loc, ArrayRef<int64_t> originalShape,
ArrayRef<int64_t> distributedShape, int64_t warpSize, Value laneId,
SmallVectorImpl<Value> &delinearizedIds) const {
  // If the original shape and the distributed shape are the same, nothing is
  // distributed: every lane handles the whole shape, and lane IDs should not
  // be relied on later. Just return an empty lane ID vector.
if (originalShape == distributedShape) {
delinearizedIds.clear();
return true;
}
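
  // Compute how many lanes are assigned along each dimension.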
SmallVector<int64_t> sizes;
for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
if (large % small != 0)
return false;
sizes.push_back(large / small);
}
  if (std::accumulate(sizes.begin(), sizes.end(), int64_t(1),
                      std::multiplies<int64_t>()) != warpSize)
    return false;
  AffineExpr s0;
  bindSymbols(builder.getContext(), s0);
int64_t usedThreads = 1;
Value zero = arith::ConstantIndexOp::create(builder, loc, 0);
delinearizedIds.assign(sizes.size(), zero);
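
  // Peel lane ID digits off starting from the innermost (fastest-varying)
  // dimension.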
for (int i = sizes.size() - 1; i >= 0; --i) {
usedThreads *= sizes[i];
if (usedThreads == warpSize) {
      // We've used up all available threads: no modulo is needed anymore, and
      // the remaining (outer) dimensions keep their zero initialization.
delinearizedIds[i] = laneId;
break;
}
delinearizedIds[i] =
affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
    laneId = affine::makeComposedAffineApply(
        builder, loc, s0.floorDiv(sizes[i]), {laneId});
}
return true;
}