blob: 7bae0602fe5cab1b6a2aba79148d181309730790 [file] [log] [blame]
//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/CodeGen/Target.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/registration.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/SmallVector.h"
namespace fir {
#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir
using namespace Fortran::runtime::cuda;
namespace {
// Return true iff \p shape has exactly one extent and that extent is
// produced by a fir.assumed_size_extent operation (i.e. the entity is an
// assumed-size array).
static bool isAssumedSize(mlir::ValueRange shape) {
  if (shape.size() != 1)
    return false;
  // getDefiningOp() is null for block arguments; isa_and_nonnull covers it.
  mlir::Operation *extentDef = shape[0].getDefiningOp();
  return llvm::isa_and_nonnull<fir::AssumedSizeExtentOp>(extentDef);
}
// Create the fir.global backing a kernel's shared memory at the end of
// \p gpuMod. Dynamic shared memory gets a single externally-linked global
// named <prefix><cudaSharedMemSuffix>; static shared memory gets an
// internally-linked global additionally disambiguated by \p suffix. The
// global carries the cuf.data_attr = shared attribute and, when \p size is
// non-zero, a zero-filled dense initializer of \p size i8 elements.
static void createSharedMemoryGlobal(fir::FirOpBuilder &builder,
                                     mlir::Location loc, llvm::StringRef prefix,
                                     llvm::StringRef suffix,
                                     mlir::gpu::GPUModuleOp gpuMod,
                                     mlir::Type sharedMemType, unsigned size,
                                     unsigned align, bool isDynamic) {
  std::string globalName =
      (prefix + llvm::Twine(cudaSharedMemSuffix)).str();
  if (!isDynamic)
    globalName += suffix.str();

  // Restore the caller's insertion point when done; the global itself is
  // appended at the end of the GPU module body.
  mlir::OpBuilder::InsertionGuard insertGuard(builder);
  builder.setInsertionPointToEnd(gpuMod.getBody());

  mlir::StringAttr linkage = isDynamic ? builder.createExternalLinkage()
                                       : builder.createInternalLinkage();

  mlir::MLIRContext *ctx = gpuMod.getContext();
  mlir::OperationName globalOpName(fir::GlobalOp::getOperationName(), ctx);
  llvm::SmallVector<mlir::NamedAttribute> attrs;
  attrs.emplace_back(
      fir::GlobalOp::getDataAttrAttrName(globalOpName),
      cuf::DataAttributeAttr::get(ctx, cuf::DataAttribute::Shared));

  // Zero-initialize when a static size is known; otherwise leave the
  // initializer empty (dynamic shared memory has no compile-time contents).
  mlir::DenseElementsAttr initVal = {};
  mlir::Type byteTy = builder.getI8Type();
  if (size > 0) {
    auto bytesVecTy = mlir::VectorType::get(
        static_cast<fir::SequenceType::Extent>(size), byteTy);
    mlir::Attribute zeroByte = mlir::IntegerAttr::get(byteTy, 0);
    initVal = mlir::DenseElementsAttr::get(bytesVecTy, llvm::ArrayRef(zeroByte));
  }

  auto globalOp =
      fir::GlobalOp::create(builder, loc, globalName, false, false,
                            sharedMemType, initVal, linkage, attrs);
  globalOp.setAlignment(align);
}
struct CUFComputeSharedMemoryOffsetsAndSize
    : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
          CUFComputeSharedMemoryOffsetsAndSize> {

  // For each GPU kernel, assign a byte offset to every cuf.shared_memory
  // operation and create the backing global(s):
  // - Each static shared variable gets its own zero-initialized,
  //   internally-linked global (offset 0 within it).
  // - All dynamic shared variables of a kernel share one externally-linked,
  //   zero-sized global; their offsets are SSA values accumulated from the
  //   runtime sizes of the preceding dynamic variables.
  void runOnOperation() override {
    mlir::ModuleOp mod = getOperation();
    mlir::SymbolTable symTab(mod);
    mlir::OpBuilder opBuilder{mod.getBodyRegion()};
    fir::FirOpBuilder builder(opBuilder, mod);
    fir::KindMapping kindMap{fir::getKindMapping(mod)};
    std::optional<mlir::DataLayout> dl =
        fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
    if (!dl) {
      // The data layout is dereferenced below to compute type sizes and
      // alignments; bail out here instead of dereferencing an empty optional.
      mlir::emitError(mod.getLoc(),
                      "data layout attribute is required to perform " +
                          getName() + " pass");
      signalPassFailure();
      return;
    }
    auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
    mlir::Type i8Ty = builder.getI8Type();
    mlir::Type i32Ty = builder.getI32Type();
    mlir::Type idxTy = builder.getIndexType();
    for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
      unsigned nbDynamicSharedVariables = 0;
      unsigned nbStaticSharedVariables = 0;
      uint64_t sharedMemSize = 0;
      unsigned short alignment = 0;
      // Running byte offset (SSA value) of the next dynamic shared variable
      // within the kernel's dynamic shared memory block.
      mlir::Value crtDynOffset;
      // Go over each shared memory operation and compute their start offset and
      // the size and alignment of the global to be generated if all variables
      // are static. If this is dynamic shared memory, then only the alignment
      // is computed.
      for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
        mlir::Location loc = sharedOp.getLoc();
        builder.setInsertionPoint(sharedOp);
        if (fir::hasDynamicSize(sharedOp.getInType())) {
          mlir::Type ty = sharedOp.getInType();
          // Size/alignment are taken from the element type; the number of
          // elements is only known at runtime (from the shape operands).
          if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
            ty = seqTy.getEleTy();
          unsigned short align = dl->getTypeABIAlignment(ty);
          alignment = std::max(alignment, align);
          uint64_t tySize = dl->getTypeSize(ty);
          ++nbDynamicSharedVariables;
          // An assumed-size variable spans the whole dynamic block, so it
          // (like the first dynamic variable) starts at offset 0.
          if (isAssumedSize(sharedOp.getShape()) || !crtDynOffset) {
            mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
            sharedOp.getOffsetMutable().assign(zero);
          } else {
            sharedOp.getOffsetMutable().assign(
                builder.createConvert(loc, i32Ty, crtDynOffset));
          }
          // Accumulate this variable's byte size:
          // element size * product of runtime extents.
          mlir::Value dynSize =
              builder.createIntegerConstant(loc, idxTy, tySize);
          for (auto extent : sharedOp.getShape())
            dynSize =
                mlir::arith::MulIOp::create(builder, loc, dynSize, extent);
          if (crtDynOffset)
            crtDynOffset = mlir::arith::AddIOp::create(builder, loc,
                                                       crtDynOffset, dynSize);
          else
            crtDynOffset = dynSize;
        } else {
          // Static shared memory: emit one dedicated global per variable,
          // sized in i8 units and zero-initialized.
          auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
              loc, sharedOp.getInType(), *dl, kindMap);
          // NOTE(review): getBindcName() is dereferenced unconditionally —
          // confirm it is always present on static cuf.shared_memory ops.
          createSharedMemoryGlobal(
              builder, sharedOp.getLoc(), funcOp.getName(),
              *sharedOp.getBindcName(), gpuMod,
              fir::SequenceType::get(size, i8Ty), size,
              sharedOp.getAlignment() ? *sharedOp.getAlignment() : align,
              /*isDynamic=*/false);
          mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
          sharedOp.getOffsetMutable().assign(zero);
          if (!sharedOp.getAlignment())
            sharedOp.setAlignment(align);
          sharedOp.setIsStatic(true);
          ++nbStaticSharedVariables;
        }
      }
      if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0)
        continue;
      // Mixing the two kinds in one kernel is unsupported; diagnose it.
      if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0)
        mlir::emitError(
            funcOp.getLoc(),
            "static and dynamic shared variables in a single kernel");
      // Static variables already received their own globals above.
      if (nbStaticSharedVariables > 0)
        continue;
      // Dynamic shared memory: one external global (sharedMemSize stays 0;
      // the storage is supplied at kernel launch).
      auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
      createSharedMemoryGlobal(builder, funcOp.getLoc(), funcOp.getName(), "",
                               gpuMod, sharedMemType, sharedMemSize, alignment,
                               /*isDynamic=*/true);
    }
  }
};
} // end anonymous namespace