//===-- CUFGPUToLLVMConversion.cpp ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Support/Fortran.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"
namespace fir {
#define GEN_PASS_DEF_CUFGPUTOLLVMCONVERSION
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace fir;
using namespace mlir;
using namespace Fortran::runtime;

namespace {
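
/// Create the array of kernel arguments expected by the CUDA launch API:
/// every argument value is stored into a member of a stack-allocated struct,
/// and a second stack-allocated array collects the addresses of those
/// members (one pointer per argument). The returned value is the array.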
static mlir::Value createKernelArgArray(mlir::Location loc,
mlir::ValueRange operands,
mlir::PatternRewriter &rewriter) {
auto *ctx = rewriter.getContext();
llvm::SmallVector<mlir::Type> structTypes(operands.size(), nullptr);
for (auto [i, arg] : llvm::enumerate(operands))
structTypes[i] = arg.getType();
auto structTy = mlir::LLVM::LLVMStructType::getLiteral(ctx, structTypes);
auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext());
mlir::Type i32Ty = rewriter.getI32Type();
auto zero = rewriter.create<mlir::LLVM::ConstantOp>(
loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0));
auto one = rewriter.create<mlir::LLVM::ConstantOp>(
loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 1));
mlir::Value argStruct =
rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, structTy, one);
auto size = rewriter.create<mlir::LLVM::ConstantOp>(
loc, i32Ty, rewriter.getIntegerAttr(i32Ty, structTypes.size()));
mlir::Value argArray =
rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, ptrTy, size);
  // Store each argument value in its struct member and record the member's
  // address in the pointer array.
  for (auto [i, arg] : llvm::enumerate(operands)) {
    auto index = rewriter.create<mlir::LLVM::ConstantOp>(
        loc, i32Ty, rewriter.getIntegerAttr(i32Ty, i));
    mlir::Value structMember = rewriter.create<LLVM::GEPOp>(
        loc, ptrTy, structTy, argStruct,
        mlir::ArrayRef<mlir::Value>({zero, index}));
    rewriter.create<LLVM::StoreOp>(loc, arg, structMember);
    mlir::Value arrayMember = rewriter.create<LLVM::GEPOp>(
        loc, ptrTy, ptrTy, argArray, mlir::ArrayRef<mlir::Value>({index}));
    rewriter.create<LLVM::StoreOp>(loc, structMember, arrayMember);
  }
return argArray;
}
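
/// Rewrite a gpu.launch_func operation carrying a CUDA Fortran proc
/// attribute into a call to the matching CUF runtime entry point:
/// CUFLaunchClusterKernel when a cluster size is present, otherwise
/// CUFLaunchCooperativeKernel for grid_global kernels or CUFLaunchKernel.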
struct GPULaunchKernelConversion
: public mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp> {
explicit GPULaunchKernelConversion(
const fir::LLVMTypeConverter &typeConverter, mlir::PatternBenefit benefit)
: mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp>(typeConverter,
benefit) {}
using OpAdaptor = typename mlir::gpu::LaunchFuncOp::Adaptor;
mlir::LogicalResult
matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
// Only convert gpu.launch_func for CUDA Fortran.
    if (!op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName()))
return mlir::failure();
mlir::Location loc = op.getLoc();
auto *ctx = rewriter.getContext();
mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
mlir::Value dynamicMemorySize = op.getDynamicSharedMemorySize();
mlir::Type i32Ty = rewriter.getI32Type();
if (!dynamicMemorySize)
dynamicMemorySize = rewriter.create<mlir::LLVM::ConstantOp>(
loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0));
mlir::Value kernelArgs =
createKernelArgArray(loc, adaptor.getKernelOperands(), rewriter);
auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext());
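    // Resolve the kernel symbol; it may still be a func.func if the kernel
    // has not been lowered to the LLVM dialect yet.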
auto kernel = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(op.getKernelName());
mlir::Value kernelPtr;
if (!kernel) {
auto funcOp = mod.lookupSymbol<mlir::func::FuncOp>(op.getKernelName());
if (!funcOp)
return mlir::failure();
kernelPtr =
rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, funcOp.getName());
} else {
kernelPtr =
rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, kernel.getName());
}
auto llvmIntPtrType = mlir::IntegerType::get(
ctx, this->getTypeConverter()->getPointerBitwidth(0));
auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);
    mlir::Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, ptrTy);

    // The stream, when present, is carried as the single async dependency of
    // the gpu.launch_func operation.
    mlir::Value stream = nullPtr;
    if (!adaptor.getAsyncDependencies().empty()) {
      if (adaptor.getAsyncDependencies().size() != 1)
        return rewriter.notifyMatchFailure(
            op, "can only convert with exactly one stream dependency");
      stream = adaptor.getAsyncDependencies().front();
    }

if (op.hasClusterSize()) {
auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
RTNAME_STRING(CUFLaunchClusterKernel));
auto funcTy = mlir::LLVM::LLVMFunctionType::get(
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy, ptrTy},
/*isVarArg=*/false);
auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
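      // Declare the runtime entry point if it is not already in the module.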
if (!funcOp) {
mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
rewriter.setInsertionPointToStart(mod.getBody());
auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
loc, RTNAME_STRING(CUFLaunchClusterKernel), funcTy);
launchKernelFuncOp.setVisibility(
mlir::SymbolTable::Visibility::Private);
}
rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
op, funcTy, cufLaunchClusterKernel,
mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
adaptor.getClusterSizeY(), adaptor.getClusterSizeZ(),
adaptor.getGridSizeX(), adaptor.getGridSizeY(),
adaptor.getGridSizeZ(), adaptor.getBlockSizeX(),
adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
stream, dynamicMemorySize, kernelArgs, nullPtr});
} else {
auto procAttr =
op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
bool isGridGlobal =
procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal;
llvm::StringRef fctName = isGridGlobal
? RTNAME_STRING(CUFLaunchCooperativeKernel)
: RTNAME_STRING(CUFLaunchKernel);
auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(fctName);
auto funcTy = mlir::LLVM::LLVMFunctionType::get(
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy,
ptrTy},
/*isVarArg=*/false);
auto cufLaunchKernel =
mlir::SymbolRefAttr::get(mod.getContext(), fctName);
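      // Declare the runtime entry point if it is not already in the module.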
if (!funcOp) {
mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
rewriter.setInsertionPointToStart(mod.getBody());
auto launchKernelFuncOp =
rewriter.create<mlir::LLVM::LLVMFuncOp>(loc, fctName, funcTy);
launchKernelFuncOp.setVisibility(
mlir::SymbolTable::Visibility::Private);
}
rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
op, funcTy, cufLaunchKernel,
mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
adaptor.getBlockSizeZ(), stream, dynamicMemorySize,
kernelArgs, nullPtr});
}
return mlir::success();
}
};
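
/// Return the name of the function enclosing the operation, looking through
/// gpu.func, func.func, and llvm.func parents; return an empty string if
/// there is none.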
static std::string getFuncName(cuf::SharedMemoryOp op) {
if (auto gpuFuncOp = op->getParentOfType<mlir::gpu::GPUFuncOp>())
return gpuFuncOp.getName().str();
if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>())
return funcOp.getName().str();
if (auto llvmFuncOp = op->getParentOfType<mlir::LLVM::LLVMFuncOp>())
return llvmFuncOp.getSymName().str();
return "";
}
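
/// Take the address of the dynamic shared-memory global for the enclosing
/// function in the GPU module, whether it is still a fir.global or has
/// already been lowered to an llvm.mlir.global.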
static mlir::Value createAddressOfOp(mlir::ConversionPatternRewriter &rewriter,
mlir::Location loc,
gpu::GPUModuleOp gpuMod,
std::string &sharedGlobalName) {
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(
rewriter.getContext(), mlir::NVVM::NVVMMemorySpace::kSharedMemorySpace);
if (auto g = gpuMod.lookupSymbol<fir::GlobalOp>(sharedGlobalName))
return rewriter.create<mlir::LLVM::AddressOfOp>(loc, llvmPtrTy,
g.getSymName());
if (auto g = gpuMod.lookupSymbol<mlir::LLVM::GlobalOp>(sharedGlobalName))
return rewriter.create<mlir::LLVM::AddressOfOp>(loc, llvmPtrTy,
g.getSymName());
return {};
}
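
/// Lower cuf.shared_memory to an address into the dynamic shared-memory
/// global: take the global's address, cast it out of the shared address
/// space, and apply the byte offset carried by the operation.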
struct CUFSharedMemoryOpConversion
: public mlir::ConvertOpToLLVMPattern<cuf::SharedMemoryOp> {
explicit CUFSharedMemoryOpConversion(
const fir::LLVMTypeConverter &typeConverter, mlir::PatternBenefit benefit)
: mlir::ConvertOpToLLVMPattern<cuf::SharedMemoryOp>(typeConverter,
benefit) {}
using OpAdaptor = typename cuf::SharedMemoryOp::Adaptor;
mlir::LogicalResult
matchAndRewrite(cuf::SharedMemoryOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
mlir::Location loc = op->getLoc();
    if (!op.getOffset())
      return mlir::emitError(
          loc, "cuf.shared_memory must have an offset for code gen");
auto gpuMod = op->getParentOfType<gpu::GPUModuleOp>();
std::string sharedGlobalName =
(getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str();
mlir::Value sharedGlobalAddr =
createAddressOfOp(rewriter, loc, gpuMod, sharedGlobalName);
    if (!sharedGlobalAddr)
      return mlir::emitError(loc,
                             "could not find the shared memory global operation");
auto castPtr = rewriter.create<mlir::LLVM::AddrSpaceCastOp>(
loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()),
sharedGlobalAddr);
mlir::Type baseType = castPtr->getResultTypes().front();
llvm::SmallVector<mlir::LLVM::GEPArg> gepArgs = {op.getOffset()};
mlir::Value shmemPtr = rewriter.create<mlir::LLVM::GEPOp>(
loc, baseType, rewriter.getI8Type(), castPtr, gepArgs);
rewriter.replaceOp(op, {shmemPtr});
return mlir::success();
}
};
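
/// cuf.stream_cast is a pure type cast at this level: replace it with the
/// already-converted stream value.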
struct CUFStreamCastConversion
: public mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp> {
explicit CUFStreamCastConversion(const fir::LLVMTypeConverter &typeConverter,
mlir::PatternBenefit benefit)
: mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp>(typeConverter,
benefit) {}
using OpAdaptor = typename cuf::StreamCastOp::Adaptor;
mlir::LogicalResult
matchAndRewrite(cuf::StreamCastOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
rewriter.replaceOp(op, adaptor.getStream());
return mlir::success();
}
};
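
/// Pass converting the remaining CUF operations and CUDA Fortran kernel
/// launches in a module to the LLVM dialect.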
class CUFGPUToLLVMConversion
: public fir::impl::CUFGPUToLLVMConversionBase<CUFGPUToLLVMConversion> {
public:
void runOnOperation() override {
auto *ctx = &getContext();
mlir::RewritePatternSet patterns(ctx);
mlir::ConversionTarget target(*ctx);
mlir::Operation *op = getOperation();
mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
if (!module)
return signalPassFailure();
    std::optional<mlir::DataLayout> dl = fir::support::getOrSetMLIRDataLayout(
        module, /*allowDefaultLayout=*/false);
    if (!dl) {
      mlir::emitError(module.getLoc(), "module has no data layout");
      return signalPassFailure();
    }
    fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
                                         /*forceUnifiedTBAATree=*/false, *dl);
cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns);
    target.addDynamicallyLegalOp<mlir::gpu::LaunchFuncOp>(
        [&](mlir::gpu::LaunchFuncOp op) {
          // Only gpu.launch_func operations that carry a CUDA Fortran proc
          // attribute are rewritten by this pass.
          return !op->getAttrOfType<cuf::ProcAttributeAttr>(
              cuf::getProcAttrName());
        });
target.addIllegalOp<cuf::SharedMemoryOp>();
target.addLegalDialect<mlir::LLVM::LLVMDialect>();
if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
std::move(patterns)))) {
      mlir::emitError(mlir::UnknownLoc::get(ctx),
                      "error in CUF GPU op conversion");
signalPassFailure();
}
}
};
} // namespace

void cuf::populateCUFGPUToLLVMConversionPatterns(
fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
mlir::PatternBenefit benefit) {
converter.addConversion([&converter](mlir::gpu::AsyncTokenType) -> Type {
return mlir::LLVM::LLVMPointerType::get(&converter.getContext());
});
patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion,
CUFStreamCastConversion>(converter, benefit);
}