blob: e4e3d72b4a4751011120fc33ae3cda8a0e916873 [file] [log] [blame]
//===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect pattern rewriters that make GPU ops
// within a region execute asynchronously.
//
//===----------------------------------------------------------------------===//
#include "PassDetail.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/GPU/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/TypeSwitch.h"
using namespace mlir;
namespace {
// Function pass that makes the GPU ops inside a function execute
// asynchronously. The actual rewriting is delegated to the nested callback
// types, which are defined out of line further down in this file.
class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
  // Repeats `!gpu.async.token` results of `async.execute` ops so that each of
  // them has a single use.
  struct SingleTokenUseCallback;
  // Moves a trailing synchronous `gpu.wait` out of `async.execute` regions.
  struct DeferWaitCallback;
  // Threads the current async token through the GPU ops of a block.
  struct ThreadTokenCallback;

  // Pass entry point; runs the three callbacks above over the function.
  void runOnFunction() override;
};
} // namespace
/// Returns true if `op` might carry the `IsTerminator` trait.
static bool isTerminator(Operation *op) {
  const bool maybeTerminator = op->mightHaveTrait<OpTrait::IsTerminator>();
  return maybeTerminator;
}
/// Returns true unless `MemoryEffectOpInterface::hasNoEffect` reports that
/// `op` has no memory effects.
static bool hasSideEffects(Operation *op) {
  if (MemoryEffectOpInterface::hasNoEffect(op))
    return false;
  return true;
}
// Block walk callback which rewrites GPU ops implementing the
// gpu::AsyncOpInterface so that they execute asynchronously, chaining a
// `!gpu.async.token` from one op to the next.
struct GpuAsyncRegionPass::ThreadTokenCallback {
  ThreadTokenCallback(MLIRContext &context) : builder(&context) {}

  // Visits every op directly contained in `block` and interrupts the walk as
  // soon as one of them fails to be processed.
  WalkResult operator()(Block *block) {
    // Early-increment iteration: visiting an op may erase it.
    for (Operation &nested : make_early_inc_range(*block))
      if (failed(visit(&nested)))
        return WalkResult::interrupt();
    return WalkResult::advance();
  }

private:
  // Dispatches on the kind of `op`:
  //  - `gpu.launch` is rejected with an error.
  //  - An existing `gpu.wait` is chained onto the current token (if any) and
  //    its own token (possibly null) becomes the current one.
  //  - An op implementing the AsyncOpInterface gets the current token threaded
  //    through it, creating the token with a `gpu.wait async` if needed.
  //  - A terminator or an op with side effects is preceded by a host
  //    synchronizing `gpu.wait` when a token is live. A `!gpu.async.token` is
  //    therefore only used inside of its block, and GPU execution always
  //    synchronizes with the host at block boundaries.
  LogicalResult visit(Operation *op) {
    if (isa<gpu::LaunchOp>(op))
      return op->emitOpError("replace with gpu.launch_func first");

    if (auto existingWait = llvm::dyn_cast<gpu::WaitOp>(op)) {
      if (currentToken)
        existingWait.addAsyncDependency(currentToken);
      currentToken = existingWait.asyncToken();
      return success();
    }

    builder.setInsertionPoint(op);
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
      return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.

    // Host synchronization is only required while a token is live and `op` is
    // a terminator or has side effects.
    const bool needsHostSync =
        currentToken && (isTerminator(op) || hasSideEffects(op));
    if (needsHostSync)
      currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
    return success();
  }

  // Makes `asyncOp` depend on the current token and produce the next one. If
  // the op does not return a token yet, it is replaced with a clone that
  // returns one as an additional trailing result.
  LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
    Operation *oldOp = asyncOp.getOperation();
    auto tokenType = builder.getType<gpu::AsyncTokenType>();

    // Without a current token, start a new chain with a dependency-free
    // `gpu.wait async`.
    if (!currentToken)
      currentToken = createWaitOp(oldOp->getLoc(), tokenType, {});
    asyncOp.addAsyncDependency(currentToken);

    // Nothing more to do if the op already returns a token.
    currentToken = asyncOp.getAsyncToken();
    if (currentToken)
      return success();

    // Result types of the clone: the original ones followed by the token.
    SmallVector<Type, 1> typesWithToken;
    typesWithToken.reserve(1 + oldOp->getNumResults());
    for (Type type : oldOp->getResultTypes())
      typesWithToken.push_back(type);
    typesWithToken.push_back(tokenType);

    Operation *asyncClone = Operation::create(
        oldOp->getLoc(), oldOp->getName(), typesWithToken,
        oldOp->getOperands(), oldOp->getAttrDictionary(),
        oldOp->getSuccessors(), oldOp->getNumRegions());

    // Copy the region contents over to the clone.
    BlockAndValueMapping mapping;
    for (unsigned i = 0, e = oldOp->getNumRegions(); i != e; ++i)
      oldOp->getRegion(i).cloneInto(&asyncClone->getRegion(i), mapping);

    // Swap the clone in; its last result is the new current token.
    builder.insert(asyncClone);
    auto cloneResults = asyncClone->getResults();
    oldOp->replaceAllUsesWith(cloneResults.drop_back());
    currentToken = cloneResults.back();
    oldOp->erase();
    return success();
  }

  // Creates a `gpu.wait` at the builder's insertion point and returns its
  // async token (null when `resultType` is null).
  Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
    auto waitOp = builder.create<gpu::WaitOp>(loc, resultType, operands);
    return waitOp.asyncToken();
  }

  OpBuilder builder;

  // The token that represents the current asynchronous dependency. Its valid
  // range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op.
  // In between, each gpu::AsyncOpInterface depends on the current token and
  // produces the new one.
  Value currentToken = {};
};
/// Erases `executeOp` and returns a clone of it which yields `results` as
/// additional trailing results. The existing results keep their positions and
/// all of their uses are redirected to the clone.
async::ExecuteOp addExecuteResults(async::ExecuteOp executeOp,
                                   ValueRange results) {
  // Append the extra values to the operands of the terminating async.yield.
  Operation *terminator = executeOp.getBody()->getTerminator();
  terminator->insertOperands(terminator->getNumOperands(), results);

  // Assemble the result types of the clone: the unwrapped existing result
  // types first, followed by the types of the extra values.
  SmallVector<Type, 2> resultTypes;
  resultTypes.reserve(executeOp.getNumResults() + results.size());
  for (Type type : executeOp.getResultTypes()) {
    // Extract value type from !async.value.
    if (auto valueType = type.dyn_cast<async::ValueType>()) {
      resultTypes.push_back(valueType.getValueType());
      continue;
    }
    assert(type.isa<async::TokenType>() && "expected token type");
    resultTypes.push_back(type);
  }
  for (Value extra : results)
    resultTypes.push_back(extra.getType());

  // Build the replacement op right before the original one. The leading token
  // type is dropped from the type list passed to the builder.
  OpBuilder builder(executeOp);
  auto clonedOp = builder.create<async::ExecuteOp>(
      executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
      executeOp.dependencies(), executeOp.operands());

  // Replace the freshly built region contents with a copy of the original
  // body.
  Region &clonedRegion = clonedOp.getRegion();
  clonedRegion.getBlocks().clear();
  BlockAndValueMapping mapper;
  executeOp.getRegion().cloneInto(&clonedRegion, mapper);

  // Redirect the uses of the original results (everything except the newly
  // added trailing ones) and drop the original op.
  auto existingResults = clonedOp.getResults().drop_back(results.size());
  executeOp.getOperation()->replaceAllUsesWith(existingResults);
  executeOp.erase();
  return clonedOp;
}
// Callback for `async.execute` ops which tries to push the contained
// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
//
// The call operator only collects candidate `gpu.wait` ops in a worklist; the
// IR is rewritten when the callback object is destroyed.
struct GpuAsyncRegionPass::DeferWaitCallback {
  // If the token of `executeOp` is used only in `async.execute` or
  // `async.await` ops, add the region's last `gpu.wait` op to the worklist if
  // it is synchronous and is the last op with side effects.
  void operator()(async::ExecuteOp executeOp) {
    if (!areAllUsersExecuteOrAwait(executeOp.token()))
      return;
    // async.execute's region is currently restricted to one block.
    for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
      if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
        if (!waitOp.asyncToken())
          worklist.push_back(waitOp);
        return;
      }
      if (hasSideEffects(&op))
        return;
    }
  }

  // The destructor performs the actual rewrite work.
  ~DeferWaitCallback() {
    // Index-based loop because addAsyncDependencyAfter() may append further
    // `gpu.wait` ops to the worklist while it is being processed.
    for (size_t i = 0; i < worklist.size(); ++i) {
      auto waitOp = worklist[i];
      auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();

      // Erase `gpu.wait` and return async dependencies from execute op instead.
      SmallVector<Value, 4> dependencies = waitOp.asyncDependencies();
      waitOp.erase();
      executeOp = addExecuteResults(executeOp, dependencies);

      // Add the async dependency to each user of the `async.execute` token.
      auto asyncTokens = executeOp.getResults().take_back(dependencies.size());
      // Snapshot the users before touching them: addAsyncDependencyAfter()
      // appends operands to `async.execute` users, which may relocate their
      // operand storage and thereby invalidate a live iterator over the
      // token's use list.
      SmallVector<Operation *, 4> users(executeOp.token().user_begin(),
                                        executeOp.token().user_end());
      for (Operation *user : users)
        addAsyncDependencyAfter(asyncTokens, user);
    }
  }

private:
  // Returns whether all token users are either 'async.execute' or 'async.await'
  // ops. This is used as a requirement for pushing 'gpu.wait' ops from a
  // 'async.execute' body to its users. Specifically, we do not allow
  // terminator users, because it could mean that the `async.execute` is inside
  // control flow code.
  static bool areAllUsersExecuteOrAwait(Value token) {
    return !token.use_empty() &&
           llvm::all_of(token.getUsers(), [](Operation *user) {
             return isa<async::ExecuteOp, async::AwaitOp>(user);
           });
  }

  // Add the `asyncTokens` as dependencies as needed after `op`, which must be
  // an `async.execute` or `async.await` op.
  void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
    // `it` below is only initialized by one of the two TypeSwitch cases.
    assert((isa<async::ExecuteOp, async::AwaitOp>(op)) &&
           "expected async.execute or async.await user");

    OpBuilder builder(op->getContext());
    auto loc = op->getLoc();

    Block::iterator it;
    SmallVector<Value, 1> tokens;
    tokens.reserve(asyncTokens.size());
    TypeSwitch<Operation *>(op)
        .Case<async::AwaitOp>([&](auto awaitOp) {
          // Add async.await ops to wait for the !gpu.async.tokens.
          builder.setInsertionPointAfter(op);
          for (auto asyncToken : asyncTokens)
            tokens.push_back(
                builder.create<async::AwaitOp>(loc, asyncToken).result());
          // Set `it` after the inserted async.await ops.
          it = builder.getInsertionPoint();
        })
        .Case<async::ExecuteOp>([&](auto executeOp) {
          // Set `it` to the beginning of the region and add asyncTokens to the
          // async.execute operands.
          it = executeOp.getBody()->begin();
          executeOp.operandsMutable().append(asyncTokens);
          SmallVector<Type, 1> tokenTypes(
              asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
          copy(executeOp.getBody()->addArguments(tokenTypes),
               std::back_inserter(tokens));
        });

    // Advance `it` to terminator or op with side-effects. The search is
    // bounded by the end of the block that `it` points into.
    Block::iterator blockEnd = it->getBlock()->end();
    it = std::find_if(it, blockEnd, [](Operation &op) {
      return isTerminator(&op) || hasSideEffects(&op);
    });
    assert(it != blockEnd && "expected terminator or op with side effects");

    // If `op` implements the AsyncOpInterface, add `token` to the list of async
    // dependencies.
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
      for (auto token : tokens)
        asyncOp.addAsyncDependency(token);
      return;
    }

    // Otherwise, insert a gpu.wait before 'it'.
    builder.setInsertionPoint(it->getBlock(), it);
    auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);

    // If the new waitOp is at the end of an async.execute region, add it to the
    // worklist. 'operator()(executeOp)' would do the same, but this is faster.
    auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
    if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) &&
        !it->getNextNode())
      worklist.push_back(waitOp);
  }

  // Synchronous `gpu.wait` ops to be moved out of their `async.execute` region.
  SmallVector<gpu::WaitOp, 8> worklist;
};
// Callback for `async.execute` ops which repeats !gpu.async.token results
// so that each of them is only used once.
struct GpuAsyncRegionPass::SingleTokenUseCallback {
  // Replaces `executeOp` with a clone that yields every multi-use
  // !gpu.async.token once per use, and redirects all uses but the first one of
  // such a result to the added results.
  void operator()(async::ExecuteOp executeOp) {
    // Extract !gpu.async.token results which have multiple uses.
    auto multiUseResults =
        llvm::make_filter_range(executeOp.results(), [](OpResult result) {
          if (result.use_empty() || result.hasOneUse())
            return false;
          auto valueType = result.getType().dyn_cast<async::ValueType>();
          return valueType &&
                 valueType.getValueType().isa<gpu::AsyncTokenType>();
        });
    if (multiUseResults.empty())
      return;

    // Indices within !async.execute results (i.e. without the async.token).
    // They are collected up front because `executeOp` is replaced below; the
    // indices stay valid since new results are only ever appended.
    SmallVector<int, 4> indices;
    transform(multiUseResults, std::back_inserter(indices),
              [](OpResult result) {
                return result.getResultNumber() - 1; // Index without token.
              });

    for (auto index : indices) {
      assert(!executeOp.results()[index].getUses().empty());
      // Repeat async.yield token result, one for each use after the first one.
      auto uses = llvm::drop_begin(executeOp.results()[index].getUses());
      auto count = std::distance(uses.begin(), uses.end());
      auto yieldOp = cast<async::YieldOp>(executeOp.getBody()->getTerminator());
      SmallVector<Value, 4> operands(count, yieldOp.getOperand(index));
      executeOp = addExecuteResults(executeOp, operands);

      // Snapshot all uses but the first one of the new executeOp's result.
      // OpOperand::set() moves the operand to the use list of the new value,
      // so rewriting while iterating the use list would cut the iteration
      // short and leave uses behind when there are more than two of them.
      SmallVector<OpOperand *, 4> extraUses;
      extraUses.reserve(count);
      for (OpOperand &use :
           llvm::drop_begin(executeOp.results()[index].getUses()))
        extraUses.push_back(&use);

      // Give each of the extra uses its own repeated result.
      auto results = executeOp.results().take_back(count);
      for (auto pair : llvm::zip(extraUses, results))
        std::get<0>(pair)->set(std::get<1>(pair));
    }
  }
};
// Replaces synchronous GPU ops in the op's region with asynchronous ones and
// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
// execution semantics and that no GPU ops are asynchronous yet.
void GpuAsyncRegionPass::runOnFunction() {
if (getFunction()->walk(ThreadTokenCallback(getContext())).wasInterrupted())
return signalPassFailure();
// Collect gpu.wait ops that we can move out of async.execute regions.
getFunction().getRegion().walk(DeferWaitCallback());
// Makes each !gpu.async.token returned from async.execute op have single use.
getFunction().getRegion().walk(SingleTokenUseCallback());
}
/// Creates a new instance of the GPU async region pass.
std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
  std::unique_ptr<OperationPass<FuncOp>> pass =
      std::make_unique<GpuAsyncRegionPass>();
  return pass;
}