| //===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
// This file implements the GPU dialect pattern rewriters that make GPU ops
// within a region execute asynchronously.
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "PassDetail.h" |
| #include "mlir/Dialect/Async/IR/Async.h" |
| #include "mlir/Dialect/GPU/GPUDialect.h" |
| #include "mlir/Dialect/GPU/Passes.h" |
| #include "mlir/Dialect/GPU/Utils.h" |
| #include "mlir/Dialect/StandardOps/IR/Ops.h" |
| #include "mlir/IR/BlockAndValueMapping.h" |
| #include "mlir/IR/Builders.h" |
| #include "mlir/IR/PatternMatch.h" |
| #include "mlir/IR/SymbolTable.h" |
| #include "mlir/Support/LLVM.h" |
| #include "mlir/Transforms/RegionUtils.h" |
| #include "llvm/ADT/TypeSwitch.h" |
| |
| using namespace mlir; |
| namespace { |
// Pass that rewrites the GPU ops inside a function to execute asynchronously,
// threading !gpu.async.token values between them and inserting the required
// host synchronization (`gpu.wait`). The rewriting itself is carried out by
// the three walk callbacks declared here and defined below in this file.
class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
  // Makes GPU ops implementing AsyncOpInterface execute asynchronously.
  struct ThreadTokenCallback;
  // Pushes trailing `gpu.wait` ops out of `async.execute` regions.
  struct DeferWaitCallback;
  // Duplicates multi-use !gpu.async.token results so each has a single use.
  struct SingleTokenUseCallback;
  void runOnFunction() override;
};
| } // namespace |
| |
| static bool isTerminator(Operation *op) { |
| return op->mightHaveTrait<OpTrait::IsTerminator>(); |
| } |
| static bool hasSideEffects(Operation *op) { |
| return !MemoryEffectOpInterface::hasNoEffect(op); |
| } |
| |
// Region walk callback which makes GPU ops implementing the AsyncOpInterface
// execute asynchronously.
struct GpuAsyncRegionPass::ThreadTokenCallback {
  ThreadTokenCallback(MLIRContext &context) : builder(&context) {}

  // Visits all ops of `block` in order. `make_early_inc_range` is required
  // because visit() may erase the current op (see rewriteAsyncOp).
  WalkResult operator()(Block *block) {
    for (Operation &op : make_early_inc_range(*block)) {
      if (failed(visit(&op)))
        return WalkResult::interrupt();
    }
    return WalkResult::advance();
  }

private:
  // If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
  // create a current token (unless it already exists), and 'thread' that token
  // through the `op` so that it executes asynchronously.
  //
  // If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to
  // host-synchronize execution. A `!gpu.async.token` will therefore only be
  // used inside of its block and GPU execution will always synchronize with
  // the host at block boundaries.
  LogicalResult visit(Operation *op) {
    // gpu.launch has a region and is not handled here; it must have been
    // outlined to gpu.launch_func by an earlier pass.
    if (isa<gpu::LaunchOp>(op))
      return op->emitOpError("replace with gpu.launch_func first");
    if (auto waitOp = llvm::dyn_cast<gpu::WaitOp>(op)) {
      // Fold an existing wait into the token chain: it waits on the current
      // token (if any), and its token result (null for a synchronous wait)
      // becomes the new current token.
      if (currentToken)
        waitOp.addAsyncDependency(currentToken);
      currentToken = waitOp.asyncToken();
      return success();
    }
    builder.setInsertionPoint(op);
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
      return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
    if (!currentToken)
      return success();
    // Insert host synchronization before terminator or op with side effects.
    // The synchronous wait returns no token, so currentToken becomes null.
    if (isTerminator(op) || hasSideEffects(op))
      currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
    return success();
  }

  // Replaces asyncOp with a clone that returns a token.
  LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
    auto *op = asyncOp.getOperation();
    auto tokenType = builder.getType<gpu::AsyncTokenType>();

    // If there is no current token, insert a `gpu.wait async` without
    // dependencies to create one.
    if (!currentToken)
      currentToken = createWaitOp(op->getLoc(), tokenType, {});
    asyncOp.addAsyncDependency(currentToken);

    // Return early if op returns a token already.
    currentToken = asyncOp.getAsyncToken();
    if (currentToken)
      return success();

    // Clone the op to return a token in addition to the other results.
    // The token type is appended as the last result.
    SmallVector<Type, 1> resultTypes;
    resultTypes.reserve(1 + op->getNumResults());
    copy(op->getResultTypes(), std::back_inserter(resultTypes));
    resultTypes.push_back(tokenType);
    auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
                                    op->getOperands(), op->getAttrDictionary(),
                                    op->getSuccessors(), op->getNumRegions());

    // Clone regions into new op.
    BlockAndValueMapping mapping;
    for (auto pair : llvm::zip_first(op->getRegions(), newOp->getRegions()))
      std::get<0>(pair).cloneInto(&std::get<1>(pair), mapping);

    // Replace the op with the async clone. Only the original results are
    // remapped; the extra token result (last) becomes the current token.
    auto results = newOp->getResults();
    currentToken = results.back();
    builder.insert(newOp);
    op->replaceAllUsesWith(results.drop_back());
    op->erase();

    return success();
  }

  // Creates a gpu.wait op at the current insertion point and returns its
  // token result (null when `resultType` is null, i.e. a synchronous wait).
  Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
    return builder.create<gpu::WaitOp>(loc, resultType, operands).asyncToken();
  }

  OpBuilder builder;

  // The token that represents the current asynchronous dependency. Its valid
  // range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op.
  // In between, each gpu::AsyncOpInterface depends on the current token and
  // produces the new one.
  Value currentToken = {};
};
| |
| /// Erases `executeOp` and returns a clone with additional `results`. |
| async::ExecuteOp addExecuteResults(async::ExecuteOp executeOp, |
| ValueRange results) { |
| // Add values to async.yield op. |
| Operation *yieldOp = executeOp.getBody()->getTerminator(); |
| yieldOp->insertOperands(yieldOp->getNumOperands(), results); |
| |
| // Construct new result type list with additional types. |
| SmallVector<Type, 2> resultTypes; |
| resultTypes.reserve(executeOp.getNumResults() + results.size()); |
| transform(executeOp.getResultTypes(), std::back_inserter(resultTypes), |
| [](Type type) { |
| // Extract value type from !async.value. |
| if (auto valueType = type.dyn_cast<async::ValueType>()) |
| return valueType.getValueType(); |
| assert(type.isa<async::TokenType>() && "expected token type"); |
| return type; |
| }); |
| transform(results, std::back_inserter(resultTypes), |
| [](Value value) { return value.getType(); }); |
| |
| // Clone executeOp with the extra results. |
| OpBuilder builder(executeOp); |
| auto newOp = builder.create<async::ExecuteOp>( |
| executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/, |
| executeOp.dependencies(), executeOp.operands()); |
| BlockAndValueMapping mapper; |
| newOp.getRegion().getBlocks().clear(); |
| executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper); |
| |
| // Replace executeOp with cloned one. |
| executeOp.getOperation()->replaceAllUsesWith( |
| newOp.getResults().drop_back(results.size())); |
| executeOp.erase(); |
| |
| return newOp; |
| } |
| |
// Callback for `async.execute` ops which tries to push the contained
// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
//
// operator() only collects candidates; the actual rewriting happens in the
// destructor so that new candidates discovered during rewriting can be
// appended to the worklist while it is being processed.
struct GpuAsyncRegionPass::DeferWaitCallback {
  // If the `executeOp`s token is used only in `async.execute` or `async.await`
  // ops, add the region's last `gpu.wait` op to the worklist if it is
  // synchronous and is the last op with side effects.
  void operator()(async::ExecuteOp executeOp) {
    if (!areAllUsersExecuteOrAwait(executeOp.token()))
      return;
    // async.execute's region is currently restricted to one block.
    for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
      if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
        // Only a synchronous wait (no token result) can be deferred.
        if (!waitOp.asyncToken())
          worklist.push_back(waitOp);
        return;
      }
      // An op with side effects below the wait would be reordered relative to
      // it if the wait were moved out; give up in that case.
      if (hasSideEffects(&op))
        return;
    }
  }

  // The destructor performs the actual rewrite work.
  ~DeferWaitCallback() {
    // Index-based loop: addAsyncDependencyAfter() may push new entries onto
    // `worklist` while it is being traversed, so iterators would dangle.
    for (size_t i = 0; i < worklist.size(); ++i) {
      auto waitOp = worklist[i];
      auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();

      // Erase `gpu.wait` and return async dependencies from execute op instead.
      SmallVector<Value, 4> dependencies = waitOp.asyncDependencies();
      waitOp.erase();
      executeOp = addExecuteResults(executeOp, dependencies);

      // Add the async dependency to each user of the `async.execute` token.
      auto asyncTokens = executeOp.getResults().take_back(dependencies.size());
      for (Operation *user : executeOp.token().getUsers())
        addAsyncDependencyAfter(asyncTokens, user);
    }
  }

private:
  // Returns whether all token users are either 'async.execute' or 'async.await'
  // ops. This is used as a requirement for pushing 'gpu.wait' ops from a
  // 'async.execute' body to it's users. Specifically, we do not allow
  // terminator users, because it could mean that the `async.execute` is inside
  // control flow code.
  static bool areAllUsersExecuteOrAwait(Value token) {
    return !token.use_empty() &&
           llvm::all_of(token.getUsers(), [](Operation *user) {
             return isa<async::ExecuteOp, async::AwaitOp>(user);
           });
  }

  // Add the `asyncToken` as dependency as needed after `op`.
  void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
    OpBuilder builder(op->getContext());
    auto loc = op->getLoc();

    Block::iterator it;
    SmallVector<Value, 1> tokens;
    tokens.reserve(asyncTokens.size());
    // `op` is either an async.await or an async.execute user of the token
    // (guaranteed by areAllUsersExecuteOrAwait). In both cases, materialize
    // the !gpu.async.token values into `tokens` and set `it` to the position
    // from which to search for the op that must consume them.
    TypeSwitch<Operation *>(op)
        .Case<async::AwaitOp>([&](auto awaitOp) {
          // Add async.await ops to wait for the !gpu.async.tokens.
          builder.setInsertionPointAfter(op);
          for (auto asyncToken : asyncTokens)
            tokens.push_back(
                builder.create<async::AwaitOp>(loc, asyncToken).result());
          // Set `it` after the inserted async.await ops.
          it = builder.getInsertionPoint();
        })
        .Case<async::ExecuteOp>([&](auto executeOp) {
          // Set `it` to the beginning of the region and add asyncTokens to the
          // async.execute operands.
          it = executeOp.getBody()->begin();
          executeOp.operandsMutable().append(asyncTokens);
          SmallVector<Type, 1> tokenTypes(
              asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
          copy(executeOp.getBody()->addArguments(tokenTypes),
               std::back_inserter(tokens));
        });

    // Advance `it` to terminator or op with side-effects. A default-constructed
    // Block::iterator serves as the end sentinel; at the latest the search
    // stops at the block terminator.
    it = std::find_if(it, Block::iterator(), [](Operation &op) {
      return isTerminator(&op) || hasSideEffects(&op);
    });

    // If `op` implements the AsyncOpInterface, add `token` to the list of async
    // dependencies.
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
      for (auto token : tokens)
        asyncOp.addAsyncDependency(token);
      return;
    }

    // Otherwise, insert a gpu.wait before 'it'.
    builder.setInsertionPoint(it->getBlock(), it);
    auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);

    // If the new waitOp is at the end of an async.execute region, add it to the
    // worklist. 'operator()(executeOp)' would do the same, but this is faster.
    auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
    if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) &&
        !it->getNextNode())
      worklist.push_back(waitOp);
  }

  SmallVector<gpu::WaitOp, 8> worklist;
};
| |
// Callback for `async.execute` ops which repeats !gpu.async.token results
// so that each of them is only used once.
struct GpuAsyncRegionPass::SingleTokenUseCallback {
  void operator()(async::ExecuteOp executeOp) {
    // Extract !gpu.async.token results which have multiple uses.
    auto multiUseResults =
        llvm::make_filter_range(executeOp.results(), [](OpResult result) {
          if (result.use_empty() || result.hasOneUse())
            return false;
          auto valueType = result.getType().dyn_cast<async::ValueType>();
          return valueType &&
                 valueType.getValueType().isa<gpu::AsyncTokenType>();
        });
    if (multiUseResults.empty())
      return;

    // Indices within !async.execute results (i.e. without the async.token).
    // getResultNumber() counts all op results including the leading token,
    // while results() excludes it — hence the -1 offset.
    SmallVector<int, 4> indices;
    transform(multiUseResults, std::back_inserter(indices),
              [](OpResult result) {
                return result.getResultNumber() - 1; // Index without token.
              });

    for (auto index : indices) {
      assert(!executeOp.results()[index].getUses().empty());
      // Repeat async.yield token result, one for each use after the first one.
      auto uses = llvm::drop_begin(executeOp.results()[index].getUses());
      auto count = std::distance(uses.begin(), uses.end());
      auto yieldOp = cast<async::YieldOp>(executeOp.getBody()->getTerminator());
      SmallVector<Value, 4> operands(count, yieldOp.getOperand(index));
      // addExecuteResults erases the old op; continue with the clone.
      executeOp = addExecuteResults(executeOp, operands);
      // Update 'uses' to refer to the new executeOp.
      uses = llvm::drop_begin(executeOp.results()[index].getUses());
      // The repeated tokens are the clone's trailing results; rewire each
      // surplus use to its own copy.
      auto results = executeOp.results().take_back(count);
      for (auto pair : llvm::zip(uses, results))
        std::get<0>(pair).set(std::get<1>(pair));
    }
  }
};
| |
| // Replaces synchronous GPU ops in the op's region with asynchronous ones and |
| // inserts the necessary synchronization (as gpu.wait ops). Assumes sequential |
| // execution semantics and that no GPU ops are asynchronous yet. |
| void GpuAsyncRegionPass::runOnFunction() { |
| if (getFunction()->walk(ThreadTokenCallback(getContext())).wasInterrupted()) |
| return signalPassFailure(); |
| |
| // Collect gpu.wait ops that we can move out of async.execute regions. |
| getFunction().getRegion().walk(DeferWaitCallback()); |
| // Makes each !gpu.async.token returned from async.execute op have single use. |
| getFunction().getRegion().walk(SingleTokenUseCallback()); |
| } |
| |
| std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() { |
| return std::make_unique<GpuAsyncRegionPass>(); |
| } |