//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
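//
// This pass assigns a byte offset to every cuf.shared_memory operation in
// each gpu.func of the GPU module. It also creates, per kernel, a
// zero-initialized i8 array global tagged with the cuf shared data
// attribute: for static shared memory the global is sized and aligned to
// hold all shared variables, while for dynamic shared memory it is
// zero-sized, offsets are accumulated as runtime values, and only the
// global's alignment matters.
//
// Schematically, for a kernel containing (a sketch only; the exact cuf/fir
// assembly syntax and the `_QPkernel` / `__shared_mem` names are
// illustrative assumptions, not verbatim pass output):
//
//   %0 = cuf.shared_memory i32 {bindc_name = "a"} -> !fir.ref<i32>
//   %1 = cuf.shared_memory f64 {bindc_name = "b"} -> !fir.ref<f64>
//
// the pass rewrites the operations with their computed offsets and emits the
// per-kernel global:
//
//   %0 = cuf.shared_memory[%c0 : i32] i32 {bindc_name = "a"} -> !fir.ref<i32>
//   %1 = cuf.shared_memory[%c8 : i32] f64 {bindc_name = "b"} -> !fir.ref<f64>
//   fir.global internal @_QPkernel__shared_mem(dense<0> : vector<16xi8>)
//       {alignment = 8 : i64} : !fir.array<16xi8>
//
//===----------------------------------------------------------------------===//
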
#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/CodeGen/Target.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/registration.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/SmallVector.h"
namespace fir {
#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace Fortran::runtime::cuda;

namespace {

struct CUFComputeSharedMemoryOffsetsAndSize
: public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
CUFComputeSharedMemoryOffsetsAndSize> {
void runOnOperation() override {
mlir::ModuleOp mod = getOperation();
mlir::SymbolTable symTab(mod);
mlir::OpBuilder opBuilder{mod.getBodyRegion()};
fir::FirOpBuilder builder(opBuilder, mod);
fir::KindMapping kindMap{fir::getKindMapping(mod)};
std::optional<mlir::DataLayout> dl =
fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
    if (!dl) {
      mlir::emitError(mod.getLoc(),
                      "data layout attribute is required to perform " +
                          getName() + " pass");
      signalPassFailure();
      return;
    }

auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
mlir::Type i8Ty = builder.getI8Type();
mlir::Type i32Ty = builder.getI32Type();
mlir::Type idxTy = builder.getIndexType();
for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
unsigned nbDynamicSharedVariables = 0;
unsigned nbStaticSharedVariables = 0;
uint64_t sharedMemSize = 0;
unsigned short alignment = 0;
mlir::Value crtDynOffset;
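      // Running byte offset, as an SSA value, of the dynamic shared variables
      // seen so far; null until the first dynamic variable is encountered.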
      // Walk each shared memory operation and compute its start offset, as
      // well as the size and alignment of the global to be generated when all
      // variables are static. For dynamic shared memory, the offset is an SSA
      // value accumulated at runtime and only the alignment is recorded.
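      // For example (a sketch, assuming 4-byte i32 and 8-byte f64 ABI
      // alignment): an i32 followed by an f64 gets offsets 0 and 8, a global
      // of 16 bytes (4 bytes of i32, 4 bytes of padding, 8 bytes of f64), and
      // an overall alignment of 8.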
for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
mlir::Location loc = sharedOp.getLoc();
builder.setInsertionPoint(sharedOp);
if (fir::hasDynamicSize(sharedOp.getInType())) {
mlir::Type ty = sharedOp.getInType();
if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
ty = seqTy.getEleTy();
unsigned short align = dl->getTypeABIAlignment(ty);
alignment = std::max(alignment, align);
uint64_t tySize = dl->getTypeSize(ty);
++nbDynamicSharedVariables;
if (crtDynOffset) {
sharedOp.getOffsetMutable().assign(
builder.createConvert(loc, i32Ty, crtDynOffset));
} else {
mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
sharedOp.getOffsetMutable().assign(zero);
}
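          // The dynamic size in bytes is the element size multiplied by each
          // runtime extent of the array.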
mlir::Value dynSize =
builder.createIntegerConstant(loc, idxTy, tySize);
for (auto extent : sharedOp.getShape())
dynSize = builder.create<mlir::arith::MulIOp>(loc, dynSize, extent);
if (crtDynOffset)
crtDynOffset =
builder.create<mlir::arith::AddIOp>(loc, crtDynOffset, dynSize);
else
crtDynOffset = dynSize;
continue;
}
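        // Static shared variable: its offset is the size accumulated so far,
        // rounded up to the variable's ABI alignment.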
        auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
            loc, sharedOp.getInType(), *dl, kindMap);
++nbStaticSharedVariables;
mlir::Value offset = builder.createIntegerConstant(
loc, i32Ty, llvm::alignTo(sharedMemSize, align));
sharedOp.getOffsetMutable().assign(offset);
sharedMemSize =
llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
alignment = std::max(alignment, align);
}
if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0)
continue;
      if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0) {
        mlir::emitError(
            funcOp.getLoc(),
            "static and dynamic shared variables in a single kernel are not "
            "supported");
        signalPassFailure();
        continue;
      }
mlir::DenseElementsAttr init = {};
if (sharedMemSize > 0) {
auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
}
      // Create the shared memory global into which each shared variable
      // points at its assigned offset.
auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
std::string sharedMemGlobalName =
(funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
mlir::StringAttr linkage = builder.createInternalLinkage();
builder.setInsertionPointToEnd(gpuMod.getBody());
llvm::SmallVector<mlir::NamedAttribute> attrs;
auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
gpuMod.getContext());
attrs.push_back(mlir::NamedAttribute(
fir::GlobalOp::getDataAttrAttrName(globalOpName),
cuf::DataAttributeAttr::get(gpuMod.getContext(),
cuf::DataAttribute::Shared)));
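      // The global carries the cuf shared data attribute, is sized for all
      // static shared variables (zero-sized for dynamic shared memory), and
      // takes the largest ABI alignment seen among them.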
auto sharedMem = builder.create<fir::GlobalOp>(
funcOp.getLoc(), sharedMemGlobalName, false, false, sharedMemType,
init, linkage, attrs);
sharedMem.setAlignment(alignment);
}
}
};
} // end anonymous namespace