| //===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "flang/Optimizer/Builder/BoxValue.h" |
| #include "flang/Optimizer/Builder/CUFCommon.h" |
| #include "flang/Optimizer/Builder/FIRBuilder.h" |
| #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" |
| #include "flang/Optimizer/Builder/Todo.h" |
| #include "flang/Optimizer/CodeGen/Target.h" |
| #include "flang/Optimizer/CodeGen/TypeConverter.h" |
| #include "flang/Optimizer/Dialect/CUF/CUFOps.h" |
| #include "flang/Optimizer/Dialect/FIRAttr.h" |
| #include "flang/Optimizer/Dialect/FIRDialect.h" |
| #include "flang/Optimizer/Dialect/FIROps.h" |
| #include "flang/Optimizer/Dialect/FIROpsSupport.h" |
| #include "flang/Optimizer/Dialect/FIRType.h" |
| #include "flang/Optimizer/Support/DataLayout.h" |
| #include "flang/Runtime/CUDA/registration.h" |
| #include "flang/Runtime/entry-names.h" |
| #include "mlir/Dialect/DLTI/DLTI.h" |
| #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| #include "mlir/Dialect/LLVMIR/LLVMDialect.h" |
| #include "mlir/IR/Value.h" |
| #include "mlir/Pass/Pass.h" |
| #include "llvm/ADT/SmallVector.h" |
| |
| namespace fir { |
| #define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE |
| #include "flang/Optimizer/Transforms/Passes.h.inc" |
| } // namespace fir |
| |
| using namespace Fortran::runtime::cuda; |
| |
| namespace { |
| |
| static bool isAssumedSize(mlir::ValueRange shape) { |
| if (shape.size() != 1) |
| return false; |
| if (llvm::isa_and_nonnull<fir::AssumedSizeExtentOp>(shape[0].getDefiningOp())) |
| return true; |
| return false; |
| } |
| |
| static void createSharedMemoryGlobal(fir::FirOpBuilder &builder, |
| mlir::Location loc, llvm::StringRef prefix, |
| llvm::StringRef suffix, |
| mlir::gpu::GPUModuleOp gpuMod, |
| mlir::Type sharedMemType, unsigned size, |
| unsigned align, bool isDynamic) { |
| std::string sharedMemGlobalName = |
| isDynamic ? (prefix + llvm::Twine(cudaSharedMemSuffix)).str() |
| : (prefix + llvm::Twine(cudaSharedMemSuffix) + suffix).str(); |
| |
| mlir::OpBuilder::InsertionGuard guard(builder); |
| builder.setInsertionPointToEnd(gpuMod.getBody()); |
| |
| mlir::StringAttr linkage = isDynamic ? builder.createExternalLinkage() |
| : builder.createInternalLinkage(); |
| llvm::SmallVector<mlir::NamedAttribute> attrs; |
| auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(), |
| gpuMod.getContext()); |
| attrs.push_back(mlir::NamedAttribute( |
| fir::GlobalOp::getDataAttrAttrName(globalOpName), |
| cuf::DataAttributeAttr::get(gpuMod.getContext(), |
| cuf::DataAttribute::Shared))); |
| |
| mlir::DenseElementsAttr init = {}; |
| mlir::Type i8Ty = builder.getI8Type(); |
| if (size > 0) { |
| auto vecTy = mlir::VectorType::get( |
| static_cast<fir::SequenceType::Extent>(size), i8Ty); |
| mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0); |
| init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero)); |
| } |
| auto sharedMem = |
| fir::GlobalOp::create(builder, loc, sharedMemGlobalName, false, false, |
| sharedMemType, init, linkage, attrs); |
| sharedMem.setAlignment(align); |
| } |
| |
| struct CUFComputeSharedMemoryOffsetsAndSize |
| : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase< |
| CUFComputeSharedMemoryOffsetsAndSize> { |
| |
| void runOnOperation() override { |
| mlir::ModuleOp mod = getOperation(); |
| mlir::SymbolTable symTab(mod); |
| mlir::OpBuilder opBuilder{mod.getBodyRegion()}; |
| fir::FirOpBuilder builder(opBuilder, mod); |
| fir::KindMapping kindMap{fir::getKindMapping(mod)}; |
| std::optional<mlir::DataLayout> dl = |
| fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false); |
| if (!dl) { |
| mlir::emitError(mod.getLoc(), |
| "data layout attribute is required to perform " + |
| getName() + "pass"); |
| } |
| |
| auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab); |
| mlir::Type i8Ty = builder.getI8Type(); |
| mlir::Type i32Ty = builder.getI32Type(); |
| mlir::Type idxTy = builder.getIndexType(); |
| for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) { |
| unsigned nbDynamicSharedVariables = 0; |
| unsigned nbStaticSharedVariables = 0; |
| uint64_t sharedMemSize = 0; |
| unsigned short alignment = 0; |
| mlir::Value crtDynOffset; |
| |
| // Go over each shared memory operation and compute their start offset and |
| // the size and alignment of the global to be generated if all variables |
| // are static. If this is dynamic shared memory, then only the alignment |
| // is computed. |
| for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) { |
| mlir::Location loc = sharedOp.getLoc(); |
| builder.setInsertionPoint(sharedOp); |
| if (fir::hasDynamicSize(sharedOp.getInType())) { |
| mlir::Type ty = sharedOp.getInType(); |
| if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty)) |
| ty = seqTy.getEleTy(); |
| unsigned short align = dl->getTypeABIAlignment(ty); |
| alignment = std::max(alignment, align); |
| uint64_t tySize = dl->getTypeSize(ty); |
| ++nbDynamicSharedVariables; |
| if (isAssumedSize(sharedOp.getShape()) || !crtDynOffset) { |
| mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0); |
| sharedOp.getOffsetMutable().assign(zero); |
| } else { |
| sharedOp.getOffsetMutable().assign( |
| builder.createConvert(loc, i32Ty, crtDynOffset)); |
| } |
| |
| mlir::Value dynSize = |
| builder.createIntegerConstant(loc, idxTy, tySize); |
| for (auto extent : sharedOp.getShape()) |
| dynSize = |
| mlir::arith::MulIOp::create(builder, loc, dynSize, extent); |
| if (crtDynOffset) |
| crtDynOffset = mlir::arith::AddIOp::create(builder, loc, |
| crtDynOffset, dynSize); |
| else |
| crtDynOffset = dynSize; |
| } else { |
| // Static shared memory. |
| auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash( |
| loc, sharedOp.getInType(), *dl, kindMap); |
| createSharedMemoryGlobal( |
| builder, sharedOp.getLoc(), funcOp.getName(), |
| *sharedOp.getBindcName(), gpuMod, |
| fir::SequenceType::get(size, i8Ty), size, |
| sharedOp.getAlignment() ? *sharedOp.getAlignment() : align, |
| /*isDynamic=*/false); |
| mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0); |
| sharedOp.getOffsetMutable().assign(zero); |
| if (!sharedOp.getAlignment()) |
| sharedOp.setAlignment(align); |
| sharedOp.setIsStatic(true); |
| ++nbStaticSharedVariables; |
| } |
| } |
| |
| if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0) |
| continue; |
| |
| if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0) |
| mlir::emitError( |
| funcOp.getLoc(), |
| "static and dynamic shared variables in a single kernel"); |
| |
| if (nbStaticSharedVariables > 0) |
| continue; |
| |
| auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty); |
| createSharedMemoryGlobal(builder, funcOp.getLoc(), funcOp.getName(), "", |
| gpuMod, sharedMemType, sharedMemSize, alignment, |
| /*isDynamic=*/true); |
| } |
| } |
| }; |
| |
| } // end anonymous namespace |