| //===-- CUFDeviceFuncTransform.cpp ----------------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "flang/Optimizer/Builder/CUFCommon.h" |
| #include "flang/Optimizer/Builder/Todo.h" |
| #include "flang/Optimizer/Dialect/CUF/CUFOps.h" |
| #include "flang/Optimizer/Dialect/FIRAttr.h" |
| #include "flang/Optimizer/Dialect/FIRDialect.h" |
| #include "flang/Optimizer/Dialect/FIROpsSupport.h" |
| #include "flang/Optimizer/Support/InternalNames.h" |
| #include "flang/Optimizer/Transforms/Passes.h" |
| #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" |
| #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| #include "mlir/Dialect/Index/IR/IndexDialect.h" |
| #include "mlir/Dialect/Index/IR/IndexOps.h" |
| #include "mlir/Dialect/LLVMIR/LLVMDialect.h" |
| #include "mlir/Dialect/LLVMIR/NVVMDialect.h" |
| #include "mlir/Dialect/SCF/IR/SCF.h" |
| #include "mlir/IR/IRMapping.h" |
| #include "mlir/Pass/Pass.h" |
| #include "mlir/Transforms/RegionUtils.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/ADT/StringSet.h" |
| |
| namespace fir { |
| #define GEN_PASS_DEF_CUFDEVICEFUNCTRANSFORM |
| #include "flang/Optimizer/Transforms/Passes.h.inc" |
| } // namespace fir |
| |
| using namespace mlir; |
| |
| namespace { |
| |
| class CUFDeviceFuncTransform |
| : public fir::impl::CUFDeviceFuncTransformBase<CUFDeviceFuncTransform> { |
| using CUFDeviceFuncTransformBase< |
| CUFDeviceFuncTransform>::CUFDeviceFuncTransformBase; |
| |
| static gpu::GPUFuncOp createGPUFuncOp(mlir::func::FuncOp funcOp, |
| bool isGlobal, int computeCap) { |
| mlir::OpBuilder builder(funcOp.getContext()); |
| |
| mlir::Region &funcOpBody = funcOp.getBody(); |
| SetVector<Value> operands; |
| for (mlir::Value operand : funcOp.getArguments()) |
| operands.insert(operand); |
| |
| llvm::SmallVector<mlir::Type> funcOperandTypes; |
| llvm::SmallVector<mlir::Type> funcResultTypes; |
| funcOperandTypes.reserve(funcOp.getArgumentTypes().size()); |
| funcResultTypes.reserve(funcOp.getResultTypes().size()); |
| for (mlir::Type opTy : funcOp.getArgumentTypes()) |
| funcOperandTypes.push_back(opTy); |
| for (mlir::Type resTy : funcOp.getResultTypes()) |
| funcResultTypes.push_back(resTy); |
| |
| mlir::Location loc = funcOp.getLoc(); |
| |
| mlir::FunctionType type = mlir::FunctionType::get( |
| funcOp.getContext(), funcOperandTypes, funcResultTypes); |
| |
| auto deviceFuncOp = |
| gpu::GPUFuncOp::create(builder, loc, funcOp.getName(), type, |
| mlir::TypeRange{}, mlir::TypeRange{}); |
| if (isGlobal) |
| deviceFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(), |
| builder.getUnitAttr()); |
| |
| mlir::Region &deviceFuncBody = deviceFuncOp.getBody(); |
| mlir::Block &entryBlock = deviceFuncBody.front(); |
| |
| mlir::IRMapping map; |
| for (const auto &operand : enumerate(operands)) |
| map.map(operand.value(), entryBlock.getArgument(operand.index())); |
| |
| funcOpBody.cloneInto(&deviceFuncBody, map); |
| |
| deviceFuncOp.walk([](func::ReturnOp op) { |
| mlir::OpBuilder replacer(op); |
| gpu::ReturnOp gpuReturnOp = gpu::ReturnOp::create(replacer, op.getLoc()); |
| gpuReturnOp->setOperands(op.getOperands()); |
| op.erase(); |
| }); |
| |
| mlir::Block &funcOpEntry = funcOp.front(); |
| mlir::Block *clonedFuncOpEntry = map.lookup(&funcOpEntry); |
| |
| entryBlock.getOperations().splice(entryBlock.getOperations().end(), |
| clonedFuncOpEntry->getOperations()); |
| clonedFuncOpEntry->erase(); |
| |
| auto launchBoundsAttr = |
| funcOp.getOperation()->getAttrOfType<cuf::LaunchBoundsAttr>( |
| cuf::getLaunchBoundsAttrName()); |
| if (launchBoundsAttr) { |
| auto maxTPB = launchBoundsAttr.getMaxTPB().getInt(); |
| auto maxntid = |
| builder.getDenseI32ArrayAttr({static_cast<int32_t>(maxTPB), 1, 1}); |
| deviceFuncOp->setAttr(NVVM::NVVMDialect::getMaxntidAttrName(), maxntid); |
| deviceFuncOp->setAttr(NVVM::NVVMDialect::getMinctasmAttrName(), |
| launchBoundsAttr.getMinBPM()); |
| if (computeCap >= 90 && launchBoundsAttr.getUpperBoundClusterSize()) |
| deviceFuncOp->setAttr(NVVM::NVVMDialect::getClusterMaxBlocksAttrName(), |
| launchBoundsAttr.getUpperBoundClusterSize()); |
| } |
| |
| return deviceFuncOp; |
| } |
| |
| static void createHostStub(mlir::func::FuncOp funcOp, |
| mlir::SymbolTable &symTab, mlir::ModuleOp mod) { |
| mlir::Location loc = funcOp.getLoc(); |
| mlir::OpBuilder modBuilder(mod.getBodyRegion()); |
| modBuilder.setInsertionPointToEnd(mod.getBody()); |
| auto emptyStub = func::FuncOp::create(modBuilder, loc, funcOp.getName(), |
| funcOp.getFunctionType()); |
| emptyStub.setVisibility(funcOp.getVisibility()); |
| emptyStub->setAttrs(funcOp->getAttrs()); |
| auto entryBlock = emptyStub.addEntryBlock(); |
| modBuilder.setInsertionPointToEnd(entryBlock); |
| func::ReturnOp::create(modBuilder, loc); |
| |
| symTab.erase(funcOp); |
| symTab.insert(emptyStub); |
| } |
| |
| static bool isDeviceFunc(mlir::func::FuncOp funcOp) { |
| if (auto cudaProcAttr = |
| funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>( |
| cuf::getProcAttrName())) |
| if (cudaProcAttr.getValue() == cuf::ProcAttribute::Device || |
| cudaProcAttr.getValue() == cuf::ProcAttribute::Global || |
| cudaProcAttr.getValue() == cuf::ProcAttribute::GridGlobal || |
| cudaProcAttr.getValue() == cuf::ProcAttribute::HostDevice) |
| return true; |
| return false; |
| } |
| |
| void runOnOperation() override { |
| // Working on Module operation because inserting/removing function from the |
| // module is not thread-safe. |
| ModuleOp mod = getOperation(); |
| mlir::SymbolTable symbolTable(getOperation()); |
| |
| auto *ctx = getOperation().getContext(); |
| mlir::OpBuilder builder(ctx); |
| |
| gpu::GPUModuleOp gpuMod = cuf::getOrCreateGPUModule(mod, symbolTable); |
| mlir::SymbolTable gpuModSymTab(gpuMod); |
| |
| llvm::SetVector<mlir::func::FuncOp> funcsToClone; |
| llvm::SetVector<mlir::func::FuncOp> deviceFuncs; |
| llvm::SetVector<mlir::func::FuncOp> keepInModule; |
| llvm::StringSet<> deviceFuncNames; |
| |
| // Look for all function to migrate to the GPU module. |
| mod.walk([&](mlir::func::FuncOp op) { |
| if (isDeviceFunc(op)) { |
| deviceFuncs.insert(op); |
| deviceFuncNames.insert(op.getSymName()); |
| } |
| }); |
| |
| auto processCallOp = [&](fir::CallOp op) { |
| if (op.getCallee()) { |
| auto func = symbolTable.lookup<mlir::func::FuncOp>( |
| op.getCallee()->getLeafReference()); |
| if (deviceFuncs.count(func) == 0) |
| funcsToClone.insert(func); |
| } |
| }; |
| |
| // Gather all function called by device functions. |
| for (auto funcOp : deviceFuncs) { |
| funcOp.walk([&](fir::CallOp op) { processCallOp(op); }); |
| funcOp.walk([&](fir::DispatchOp op) { |
| TODO(op.getLoc(), "type-bound procedure call with dynamic dispatch " |
| "in device procedure"); |
| }); |
| } |
| |
| // Functions that are referenced in a derived-type binding table must be |
| // kept in the host module to avoid LLVM dialect verification errors. |
| for (auto globalOp : mod.getOps<fir::GlobalOp>()) { |
| if (globalOp.getName().contains(fir::kBindingTableSeparator)) { |
| globalOp.walk([&](fir::AddrOfOp addrOfOp) { |
| if (deviceFuncNames.contains(addrOfOp.getSymbol().getLeafReference())) |
| keepInModule.insert( |
| *llvm::find_if(deviceFuncs, [&](mlir::func::FuncOp f) { |
| return f.getSymName() == |
| addrOfOp.getSymbol().getLeafReference(); |
| })); |
| }); |
| } |
| } |
| |
| // Gather all functions called by CUF kernels. |
| mod.walk([&](cuf::KernelOp kernelOp) { |
| kernelOp.walk([&](fir::CallOp op) { processCallOp(op); }); |
| kernelOp.walk([&](fir::DispatchOp op) { |
| TODO(op.getLoc(), |
| "type-bound procedure call with dynamic dispatch in cuf kernel"); |
| }); |
| }); |
| |
| for (auto funcOp : funcsToClone) |
| gpuModSymTab.insert(funcOp->clone()); |
| |
| for (auto funcOp : deviceFuncs) { |
| auto cudaProcAttr = |
| funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>( |
| cuf::getProcAttrName()); |
| auto isGlobal = cudaProcAttr.getValue() == cuf::ProcAttribute::Global || |
| cudaProcAttr.getValue() == cuf::ProcAttribute::GridGlobal; |
| if (funcOp.isDeclaration()) { |
| mlir::Operation *clonedFuncOp = funcOp->clone(); |
| if (isGlobal) { |
| clonedFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(), |
| builder.getUnitAttr()); |
| clonedFuncOp->removeAttr(cuf::getProcAttrName()); |
| } |
| gpuModSymTab.insert(clonedFuncOp); |
| } else { |
| gpu::GPUFuncOp deviceFuncOp = |
| createGPUFuncOp(funcOp, isGlobal, computeCap); |
| gpuModSymTab.insert(deviceFuncOp); |
| |
| if (cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice) { |
| // If the function is a global, we need to keep the host side |
| // declaration for the kernel registration. Currently we just |
| // erase its body but in the future, the body should be rewritten |
| // to be able to launch CUDA Fortran kernel from C code. |
| if (isGlobal || keepInModule.contains(funcOp)) |
| createHostStub(funcOp, symbolTable, mod); |
| else |
| funcOp.erase(); |
| } |
| } |
| } |
| } |
| }; |
| |
| } // end anonymous namespace |