blob: 27a34695decb2322669791ecf11aa3578862cb7a [file] [edit]
//===- ACCRoutineToGPUFunc.cpp - Move ACC routines to GPU module ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The OpenACC `routine` directive defines functions that may be invoked from
// device code. Those functions need to be available in the device compilation
// unit. This pass moves materialized acc routines into the GPU module as
// gpu.func operations so they can be compiled for the device.
//
// Overview:
// ---------
// For each acc.routine that is not bound by name, the corresponding
// specialized function (created by ACCRoutineLowering) or the original
// host function (in case of seq) is cloned into the GPU module as a gpu.func.
// Callees referenced from those routines are processed: device-valid callees
// (runtime, intrinsics, other acc routines) are added to the GPU module as
// declarations or full clones as needed. Bind-name routines are not moved;
// their acc.routine ops are erased. After cloning, the host copies of
// specialized device functions and nohost routines are removed.
//
// Approach:
// ----------------
// 1. Collect materialized routines (acc.routine without bind(name)); record
// bind-name routines for erasure. Emit remarks for materialized routines.
//
// 2. Process calls: walk each materialized function; for each call, if the
// callee is already in the GPU module or is an acc routine (or specialized
// acc routine), skip; otherwise require OpenACCSupport::isValidSymbolUse.
// Valid callees are added to the clone set (as declaration or full clone).
//
// 3. Clone into GPU module: each function in the clone set is turned into a
// gpu.func (body cloned or declaration only). acc.specialized_routine is
// preserved and symbol uses are updated so the routine name is unchanged.
//
// 4. Cleanup: erase from the host module the specialized device function
// bodies and any nohost routine (host copy removed after move to device).
//
// Example:
// --------
// Before (after ACCRoutineLowering):
// acc.routine @r_seq func(@foo) seq
// func.func @foo() attributes {acc.specialized_routine = ...} { ... }
//
// After:
// acc.routine @r_seq func(@foo) seq
// gpu.module @acc_gpu_module {
// gpu.func @foo() attributes {acc.specialized_routine = ...} { ... }
// }
// (host @foo erased)
//
// Requirements:
// -------------
// - Must run after `ACCRoutineLowering` pass which ensures variants for all
// levels of parallelism are created.
// - Uses OpenACCSupport: getOrCreateGPUModule, isValidSymbolUse, emitRemark,
// emitNYI. If no custom implementation is registered, the default is used.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Interfaces/CallInterfaces.h"
#include "llvm/ADT/SetVector.h"
#include <string>
namespace mlir {
namespace acc {
#define GEN_PASS_DEF_ACCROUTINETOGPUFUNC
#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
} // namespace acc
} // namespace mlir
#define DEBUG_TYPE "acc-routine-to-gpu-func"
using namespace mlir;
using namespace mlir::acc;
namespace {
/// Build a gpu.func that mirrors `sourceFunc`: same location, name and
/// function type, with a clone of the source body in which every func.return
/// has been rewritten to gpu.return. No attributes are carried over from the
/// source function and no workgroup/private attributions are created.
static gpu::GPUFuncOp createGPUFuncFromFunc(OpBuilder &builder,
                                            func::FuncOp sourceFunc) {
  // Attributes are deliberately left empty here; acc.specialized_routine is
  // attached later by the caller when it applies.
  gpu::GPUFuncOp result = gpu::GPUFuncOp::create(
      builder, sourceFunc.getLoc(), sourceFunc.getName(),
      sourceFunc.getFunctionType(),
      /*workgroupAttributions=*/TypeRange(),
      /*privateAttributions=*/TypeRange(), /*attrs=*/{});

  Region &hostRegion = sourceFunc.getBody();
  Block &hostEntry = hostRegion.front();
  Region &gpuRegion = result.getBody();
  // GPUFuncOp::create has already produced this entry block together with its
  // arguments.
  Block &gpuEntry = gpuRegion.front();

  // Redirect uses of the host entry arguments to the gpu.func entry arguments
  // before cloning, so the cloned body refers to the new arguments.
  IRMapping valueMap;
  for (auto [hostArg, gpuArg] :
       llvm::zip(hostEntry.getArguments(), gpuEntry.getArguments()))
    valueMap.map(hostArg, gpuArg);
  hostRegion.cloneInto(&gpuRegion, valueMap);

  // gpu.func bodies terminate with gpu.return; swap out each cloned
  // func.return while keeping its location and operands.
  result.walk([](func::ReturnOp hostReturn) {
    OpBuilder atReturn(hostReturn);
    gpu::ReturnOp replacement =
        gpu::ReturnOp::create(atReturn, hostReturn.getLoc());
    replacement->setOperands(hostReturn.getOperands());
    hostReturn.erase();
  });

  // cloneInto appended a separate copy of the host entry block. Move its
  // operations to the end of the real gpu.func entry block and drop the
  // emptied copy.
  Block *clonedHostEntry = valueMap.lookup(&hostEntry);
  auto &gpuEntryOps = gpuEntry.getOperations();
  gpuEntryOps.splice(gpuEntryOps.end(), clonedHostEntry->getOperations());
  clonedHostEntry->erase();
  return result;
}
/// A function queued for placement in the GPU module, paired with the
/// acc.routine it was collected for. The routine is a null RoutineOp for
/// callees that are queued only because a routine's function calls them.
using CloneCandidate = std::pair<func::FuncOp, RoutineOp>;
/// Walk every acc.routine in `mod` and sort it into one of two groups:
///  - routines carrying a bind name (with or without `deviceType`) are added
///    to `bindAccRoutines` and nothing else is done for them;
///  - all other routines are added to `materializedAccRoutines`, the leaf
///    symbol name of their function is added to `funcsToCloneCandidates`, and
///    a remark is emitted on the function (or on the routine op itself when
///    the function is not found in `symTab`).
static void collectRoutineCandidates(
    ModuleOp mod, SymbolTable &symTab, acc::DeviceType deviceType,
    OpenACCSupport &accSupport,
    llvm::SmallSetVector<llvm::StringRef, 4> &funcsToCloneCandidates,
    llvm::SmallSetVector<RoutineOp, 4> &materializedAccRoutines,
    llvm::SmallSetVector<RoutineOp, 4> &bindAccRoutines) {
  // A routine is treated as parallel when gang, worker, vector or a gang dim
  // value is present, either for `deviceType` or without a device type.
  auto hasParallelism = [deviceType](RoutineOp routine) -> bool {
    if (routine.hasGang(deviceType) || routine.hasGang())
      return true;
    if (routine.hasWorker(deviceType) || routine.hasWorker())
      return true;
    if (routine.hasVector(deviceType) || routine.hasVector())
      return true;
    return routine.getGangDimValue(deviceType) || routine.getGangDimValue();
  };

  mod.walk([&](RoutineOp routine) {
    const bool hasBindName =
        routine.getBindNameValue() || routine.getBindNameValue(deviceType);
    if (hasBindName) {
      bindAccRoutines.insert(routine);
      return;
    }

    auto funcLeafName = routine.getFuncName().getLeafReference();
    func::FuncOp target = symTab.lookup<func::FuncOp>(funcLeafName);
    // Prefer reporting on the function; fall back to the routine op when the
    // function cannot be resolved.
    Operation *remarkAnchor =
        target ? target.getOperation() : routine.getOperation();
    accSupport.emitRemark(
        remarkAnchor,
        [&routine, &hasParallelism]() {
          std::string text = "Generating";
          text += routine.getImplicitAttr() ? " implicit acc routine"
                                            : " acc routine";
          if (!hasParallelism(routine))
            text += " seq";
          return text;
        },
        DEBUG_TYPE);

    funcsToCloneCandidates.insert(funcLeafName);
    materializedAccRoutines.insert(routine);
  });
}
/// Decide which functions have to be placed in the GPU module.
///
/// For every routine in `materializedAccRoutines`, the function it names is
/// looked up in `symTab`. If `gpuSymTab` has no symbol of that name yet, the
/// function is queued in `funcsToClone` together with its routine. Every call
/// inside the function is then inspected:
///  - calls without a symbol callee, or whose callee is not a func.func in
///    `symTab`, are ignored;
///  - callees already present in `gpuSymTab`, and callees that are acc
///    routines or specialized acc routines, are skipped;
///  - any other callee must pass OpenACCSupport::isValidSymbolUse; if it does,
///    it is queued in `funcsToClone` with a null routine, otherwise an NYI
///    diagnostic is emitted.
///
/// The function name is read from each routine rather than obtained by walking
/// `funcsToCloneCandidates` and `materializedAccRoutines` in lockstep: the name
/// set drops duplicates while the routine set does not, so positional pairing
/// attaches the wrong routine (and therefore the wrong nohost decision) as
/// soon as two routines name the same function. When several routines name
/// the same function, the function is processed once and paired with the
/// first such routine. Routines whose function name is not listed in
/// `funcsToCloneCandidates` are skipped.
///
/// Returns failure() once a function containing an unsupported call has been
/// walked; success() otherwise.
static LogicalResult processCallsInRoutines(
    SymbolTable &symTab, SymbolTable &gpuSymTab, OpenACCSupport &accSupport,
    const llvm::SmallSetVector<llvm::StringRef, 4> &funcsToCloneCandidates,
    const llvm::SmallSetVector<RoutineOp, 4> &materializedAccRoutines,
    llvm::SmallSetVector<CloneCandidate, 4> &funcsToClone) {
  LogicalResult callCheckResult = success();

  auto processCalls = [&](CallOpInterface callOp) {
    // Only direct calls through a symbol can be resolved here.
    if (!callOp.getCallableForCallee())
      return;
    auto calleeSymbolRef =
        dyn_cast<SymbolRefAttr>(callOp.getCallableForCallee());
    if (!calleeSymbolRef)
      return;
    auto callee =
        symTab.lookup<func::FuncOp>(calleeSymbolRef.getLeafReference());
    if (!callee)
      return;
    // Nothing to do when the GPU module already has this symbol.
    if (gpuSymTab.lookup(callee.getName()))
      return;
    // Callees that are themselves acc routines are not queued from here.
    if (isAccRoutine(callee) || isSpecializedAccRoutine(callee))
      return;
    if (!accSupport.isValidSymbolUse(callOp.getOperation(), calleeSymbolRef)) {
      accSupport.emitNYI(callOp->getLoc(), "Unsupported call in acc routine");
      callCheckResult = failure();
      return;
    }
    funcsToClone.insert({callee, RoutineOp{}});
  };

  // Tracks function names already handled so that a function named by several
  // routines is queued and walked only once.
  llvm::SmallSetVector<llvm::StringRef, 4> visitedFuncNames;
  for (RoutineOp accRoutine : materializedAccRoutines) {
    llvm::StringRef funcName =
        accRoutine.getFuncName().getLeafReference().getValue();
    if (!funcsToCloneCandidates.contains(funcName))
      continue;
    if (!visitedFuncNames.insert(funcName))
      continue;

    func::FuncOp func = symTab.lookup<func::FuncOp>(funcName);
    if (!func)
      continue;
    if (!gpuSymTab.lookup(funcName))
      funcsToClone.insert({func, accRoutine});
    func.walk([&](CallOpInterface callOp) { processCalls(callOp); });
    if (failed(callCheckResult))
      return failure();
  }
  return success();
}
/// Place every function in `funcsToClone` into the symbol table `gpuSymTab`.
///
/// Declarations are cloned as-is. Functions with a body are turned into a
/// gpu.func via createGPUFuncFromFunc. When the source function carries the
/// acc.specialized_routine attribute, all symbol uses of the gpu.func's
/// current name inside `mod` are redirected to the function name stored in
/// that attribute, the gpu.func is renamed to that name, and the attribute is
/// copied onto the gpu.func.
///
/// Returns failure() (after emitting an NYI diagnostic) when the symbol uses
/// cannot be replaced; the gpu.func created for that candidate is erased so it
/// does not leak. Returns success() otherwise.
static LogicalResult cloneFuncsToGPUModule(
    ModuleOp mod, OpenACCSupport &accSupport, SymbolTable &gpuSymTab,
    const llvm::SmallSetVector<CloneCandidate, 4> &funcsToClone) {
  MLIRContext *ctx = mod.getContext();
  // The builder has no insertion point, so ops it creates stay detached until
  // they are handed to gpuSymTab.insert below.
  OpBuilder builder(ctx);

  for (CloneCandidate candidate : funcsToClone) {
    func::FuncOp srcFunc = candidate.first;

    if (srcFunc.isDeclaration()) {
      gpuSymTab.insert(srcFunc->clone());
      continue;
    }

    gpu::GPUFuncOp deviceFuncOp = createGPUFuncFromFunc(builder, srcFunc);

    if (auto specRoutineAttr = srcFunc->getAttrOfType<SpecializedRoutineAttr>(
            getSpecializedRoutineAttrName())) {
      StringAttr funcName = specRoutineAttr.getFuncName();
      if (failed(SymbolTable::replaceAllSymbolUses(
              StringAttr::get(ctx, deviceFuncOp.getName()), funcName, mod))) {
        accSupport.emitNYI(deviceFuncOp.getLoc(),
                           "cannot replace symbol for acc routine");
        // The gpu.func was never inserted anywhere; erase it explicitly so the
        // detached operation is not leaked on this error path.
        deviceFuncOp.erase();
        return failure();
      }
      deviceFuncOp->setAttr(SymbolTable::getSymbolAttrName(), funcName);
      // createGPUFuncFromFunc copies no attributes, so re-attach the
      // specialization marker here.
      deviceFuncOp->setAttr(getSpecializedRoutineAttrName(), specRoutineAttr);
    }

    gpuSymTab.insert(deviceFuncOp);
  }
  return success();
}
/// Erase the functions in `funcsToClone` that should not remain where they
/// currently live: those whose paired routine is marked nohost, and those
/// that are specialized acc routines. All other candidates are left alone.
static void
cleanupHostModule(const llvm::SmallSetVector<CloneCandidate, 4> &funcsToClone) {
  for (auto [hostFunc, routine] : funcsToClone) {
    // Candidates queued without a routine (null RoutineOp) can only qualify
    // through the specialized-routine check.
    const bool routineIsNohost = routine && routine.getNohost();
    if (!routineIsNohost && !acc::isSpecializedAccRoutine(hostFunc))
      continue;
    hostFunc.erase();
  }
}
/// Pass driver. On a module that contains acc.routine ops it obtains the GPU
/// module from OpenACCSupport, collects the routines, determines the set of
/// functions to place in the GPU module, clones them, removes the host copies
/// that are no longer needed and finally erases the bind-name routine ops.
class ACCRoutineToGPUFunc
    : public acc::impl::ACCRoutineToGPUFuncBase<ACCRoutineToGPUFunc> {
public:
  using acc::impl::ACCRoutineToGPUFuncBase<
      ACCRoutineToGPUFunc>::ACCRoutineToGPUFuncBase;

  void runOnOperation() override {
    ModuleOp hostModule = getOperation();

    // Nothing to do (and no GPU module to create) without acc.routine ops.
    if (hostModule.getOps<RoutineOp>().empty()) {
      LLVM_DEBUG(llvm::dbgs()
                 << "Skipping ACCRoutineToGPUFunc - no acc.routine ops\n");
      return;
    }

    OpenACCSupport &accSupport = getAnalysis<OpenACCSupport>();
    std::optional<gpu::GPUModuleOp> maybeGpuModule =
        accSupport.getOrCreateGPUModule(hostModule);
    if (!maybeGpuModule) {
      accSupport.emitNYI(hostModule.getLoc(), "Failed to create GPU module");
      return signalPassFailure();
    }

    SymbolTable hostSymbols(hostModule);
    SymbolTable gpuSymbols(*maybeGpuModule);

    // Step 1: split routines into materialized ones and bind-name ones.
    llvm::SmallSetVector<llvm::StringRef, 4> candidateNames;
    llvm::SmallSetVector<RoutineOp, 4> materializedRoutines;
    llvm::SmallSetVector<RoutineOp, 4> boundRoutines;
    collectRoutineCandidates(hostModule, hostSymbols, this->deviceType,
                             accSupport, candidateNames, materializedRoutines,
                             boundRoutines);

    // Steps 2 and 3: gather the functions to clone, then clone them. The
    // second call only runs when the first one succeeded.
    llvm::SmallSetVector<CloneCandidate, 4> pendingClones;
    if (failed(processCallsInRoutines(hostSymbols, gpuSymbols, accSupport,
                                      candidateNames, materializedRoutines,
                                      pendingClones)) ||
        failed(cloneFuncsToGPUModule(hostModule, accSupport, gpuSymbols,
                                     pendingClones)))
      return signalPassFailure();

    // Step 4: drop host copies, then the bind-name routine ops.
    cleanupHostModule(pendingClones);
    for (RoutineOp boundRoutine : boundRoutines)
      boundRoutine.erase();
  }
};
} // namespace