lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp - llvm-project/flang - Git at Google

 //===-- CUFDeviceFuncTransform.cpp ----------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Support/InternalNames.h"
 #include "flang/Optimizer/Transforms/Passes.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringSet.h"

 namespace fir {
 #define GEN_PASS_DEF_CUFDEVICEFUNCTRANSFORM
 #include "flang/Optimizer/Transforms/Passes.h.inc"
 } // namespace fir

 using namespace mlir;

 namespace {

 /// Module pass that migrates CUDA Fortran device procedures to the GPU module.
 ///
 /// A `func.func` is treated as a device procedure when its `cuf` procedure
 /// attribute is Device, Global, GridGlobal or HostDevice. Declarations are
 /// cloned into the GPU module; definitions are rebuilt there as `gpu.func`.
 /// Functions called directly from device procedures or from `cuf.kernel`
 /// operations are cloned into the GPU module as well.
 class CUFDeviceFuncTransform
     : public fir::impl::CUFDeviceFuncTransformBase<CUFDeviceFuncTransform> {
   using CUFDeviceFuncTransformBase<
       CUFDeviceFuncTransform>::CUFDeviceFuncTransformBase;

   /// Build a `gpu.func` mirroring the definition \p funcOp.
   ///
   /// The body of \p funcOp is cloned into the new operation, every
   /// `func.return` of the clone is replaced by a `gpu.return` carrying the
   /// same operands, and the `cuf` launch bounds attribute, when present, is
   /// translated to NVVM attributes. \p funcOp itself is not modified.
   ///
   /// \param funcOp     function definition to mirror; it must have a body.
   /// \param isGlobal   when true, the result is tagged with the GPU kernel
   ///                   attribute.
   /// \param computeCap compute capability; the cluster size bound is only
   ///                   emitted when this is at least 90.
   /// \returns the new `gpu.func`. The builder used here has no insertion
   ///          point, so inserting the operation is left to the caller.
   static gpu::GPUFuncOp createGPUFuncOp(mlir::func::FuncOp funcOp,
                                         bool isGlobal, int computeCap) {
     mlir::OpBuilder builder(funcOp.getContext());
     mlir::Location loc = funcOp.getLoc();

     // The device function keeps the exact signature of the original.
     auto deviceFuncOp = gpu::GPUFuncOp::create(
         builder, loc, funcOp.getName(), funcOp.getFunctionType(),
         mlir::TypeRange{}, mlir::TypeRange{});
     if (isGlobal)
       deviceFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                             builder.getUnitAttr());

     mlir::Region &deviceFuncBody = deviceFuncOp.getBody();
     mlir::Block &entryBlock = deviceFuncBody.front();

     // While cloning, redirect uses of the original arguments to the
     // arguments of the gpu.func entry block.
     mlir::IRMapping map;
     for (const auto &arg : llvm::enumerate(funcOp.getArguments()))
       map.map(arg.value(), entryBlock.getArgument(arg.index()));

     funcOp.getBody().cloneInto(&deviceFuncBody, map);

     // The cloned body still ends in func.return; replace each one with a
     // gpu.return carrying the same operands.
     deviceFuncOp.walk([](func::ReturnOp op) {
       mlir::OpBuilder replacer(op);
       gpu::ReturnOp gpuReturnOp = gpu::ReturnOp::create(replacer, op.getLoc());
       gpuReturnOp->setOperands(op.getOperands());
       op.erase();
     });

     // The gpu.func already had an entry block before cloning. Move the
     // operations of the cloned entry block into it and drop the now empty
     // clone.
     mlir::Block *clonedFuncOpEntry = map.lookup(&funcOp.front());
     entryBlock.getOperations().splice(entryBlock.getOperations().end(),
                                       clonedFuncOpEntry->getOperations());
     clonedFuncOpEntry->erase();

     // Translate the CUF launch bounds into their NVVM counterparts.
     auto launchBoundsAttr =
         funcOp.getOperation()->getAttrOfType<cuf::LaunchBoundsAttr>(
             cuf::getLaunchBoundsAttrName());
     if (launchBoundsAttr) {
       auto maxTPB = launchBoundsAttr.getMaxTPB().getInt();
       auto maxntid =
           builder.getDenseI32ArrayAttr({static_cast<int32_t>(maxTPB), 1, 1});
       deviceFuncOp->setAttr(NVVM::NVVMDialect::getMaxntidAttrName(), maxntid);
       deviceFuncOp->setAttr(NVVM::NVVMDialect::getMinctasmAttrName(),
                             launchBoundsAttr.getMinBPM());
       if (computeCap >= 90 && launchBoundsAttr.getUpperBoundClusterSize())
         deviceFuncOp->setAttr(NVVM::NVVMDialect::getClusterMaxBlocksAttrName(),
                               launchBoundsAttr.getUpperBoundClusterSize());
     }

     return deviceFuncOp;
   }

   /// Replace \p funcOp in \p mod by a stub whose body is a single
   /// `func.return`.
   ///
   /// The stub is appended at the end of \p mod and takes over the name,
   /// function type, visibility and attributes of \p funcOp. \p funcOp is
   /// erased through \p symTab before the stub is registered there.
   ///
   /// NOTE(review): the stub always ends in an operand-less `func.return`, so
   /// this presumably expects \p funcOp to have no results - confirm for
   /// functions that are kept because of a binding table reference.
   static void createHostStub(mlir::func::FuncOp funcOp,
                              mlir::SymbolTable &symTab, mlir::ModuleOp mod) {
     mlir::Location loc = funcOp.getLoc();
     mlir::OpBuilder modBuilder(mod.getBodyRegion());
     modBuilder.setInsertionPointToEnd(mod.getBody());
     auto emptyStub = func::FuncOp::create(modBuilder, loc, funcOp.getName(),
                                           funcOp.getFunctionType());
     emptyStub.setVisibility(funcOp.getVisibility());
     emptyStub->setAttrs(funcOp->getAttrs());
     auto entryBlock = emptyStub.addEntryBlock();
     modBuilder.setInsertionPointToEnd(entryBlock);
     func::ReturnOp::create(modBuilder, loc);

     // Erase the original first so that the stub is registered under the
     // same symbol name.
     symTab.erase(funcOp);
     symTab.insert(emptyStub);
   }

   /// Return true when \p funcOp carries a `cuf` procedure attribute whose
   /// value is Device, Global, GridGlobal or HostDevice.
   static bool isDeviceFunc(mlir::func::FuncOp funcOp) {
     auto cudaProcAttr =
         funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
             cuf::getProcAttrName());
     if (!cudaProcAttr)
       return false;
     const auto kind = cudaProcAttr.getValue();
     return kind == cuf::ProcAttribute::Device ||
            kind == cuf::ProcAttribute::Global ||
            kind == cuf::ProcAttribute::GridGlobal ||
            kind == cuf::ProcAttribute::HostDevice;
   }

   /// Collect the device procedures of the module and the functions they
   /// call, then populate the GPU module and clean up the host module.
   void runOnOperation() override {
     // Working on Module operation because inserting/removing function from the
     // module is not thread-safe.
     ModuleOp mod = getOperation();
     mlir::SymbolTable symbolTable(mod);
     mlir::OpBuilder builder(mod.getContext());

     gpu::GPUModuleOp gpuMod = cuf::getOrCreateGPUModule(mod, symbolTable);
     mlir::SymbolTable gpuModSymTab(gpuMod);

     llvm::SetVector<mlir::func::FuncOp> funcsToClone;
     llvm::SetVector<mlir::func::FuncOp> deviceFuncs;
     llvm::SetVector<mlir::func::FuncOp> keepInModule;
     llvm::StringSet<> deviceFuncNames;

     // Look for all functions to migrate to the GPU module.
     mod.walk([&](mlir::func::FuncOp op) {
       if (isDeviceFunc(op)) {
         deviceFuncs.insert(op);
         deviceFuncNames.insert(op.getSymName());
       }
     });

     // Record the callee of a direct call so that it gets cloned into the GPU
     // module, unless it is itself a device procedure handled below.
     auto processCallOp = [&](fir::CallOp op) {
       if (!op.getCallee())
         return;
       auto func = symbolTable.lookup<mlir::func::FuncOp>(
           op.getCallee()->getLeafReference());
       // The lookup yields a null handle when the callee is not a func.func
       // of this module. Recording it would crash when cloning below.
       if (!func)
         return;
       if (!deviceFuncs.contains(func))
         funcsToClone.insert(func);
     };

     // Gather all functions called by device functions.
     for (auto funcOp : deviceFuncs) {
       funcOp.walk([&](fir::CallOp op) { processCallOp(op); });
       funcOp.walk([&](fir::DispatchOp op) {
         TODO(op.getLoc(), "type-bound procedure call with dynamic dispatch "
                           "in device procedure");
       });
     }

     // Functions that are referenced in a derived-type binding table must be
     // kept in the host module to avoid LLVM dialect verification errors.
     for (auto globalOp : mod.getOps<fir::GlobalOp>()) {
       if (!globalOp.getName().contains(fir::kBindingTableSeparator))
         continue;
       globalOp.walk([&](fir::AddrOfOp addrOfOp) {
         llvm::StringRef name =
             addrOfOp.getSymbol().getLeafReference().getValue();
         if (!deviceFuncNames.contains(name))
           return;
         // The name was recorded from deviceFuncs, so the search succeeds.
         keepInModule.insert(
             *llvm::find_if(deviceFuncs, [&](mlir::func::FuncOp f) {
               return f.getSymName() == name;
             }));
       });
     }

     // Gather all functions called by CUF kernels.
     mod.walk([&](cuf::KernelOp kernelOp) {
       kernelOp.walk([&](fir::CallOp op) { processCallOp(op); });
       kernelOp.walk([&](fir::DispatchOp op) {
         TODO(op.getLoc(),
              "type-bound procedure call with dynamic dispatch in cuf kernel");
       });
     });

     for (auto funcOp : funcsToClone)
       gpuModSymTab.insert(funcOp->clone());

     for (auto funcOp : deviceFuncs) {
       auto cudaProcAttr =
           funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
               cuf::getProcAttrName());
       const auto procKind = cudaProcAttr.getValue();
       const bool isGlobal = procKind == cuf::ProcAttribute::Global ||
                             procKind == cuf::ProcAttribute::GridGlobal;

       if (funcOp.isDeclaration()) {
         // Declarations are cloned as is; the host declaration is kept.
         mlir::Operation *clonedFuncOp = funcOp->clone();
         if (isGlobal) {
           clonedFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                                 builder.getUnitAttr());
           clonedFuncOp->removeAttr(cuf::getProcAttrName());
         }
         gpuModSymTab.insert(clonedFuncOp);
         continue;
       }

       gpu::GPUFuncOp deviceFuncOp =
           createGPUFuncOp(funcOp, isGlobal, computeCap);
       gpuModSymTab.insert(deviceFuncOp);

       // Host-device procedures keep their host definition unchanged.
       if (procKind == cuf::ProcAttribute::HostDevice)
         continue;

       // If the function is a global, we need to keep the host side
       // declaration for the kernel registration. Currently we just
       // erase its body but in the future, the body should be rewritten
       // to be able to launch CUDA Fortran kernel from C code.
       if (isGlobal || keepInModule.contains(funcOp))
         createHostStub(funcOp, symbolTable, mod);
       else
         funcOp.erase();
     }
   }
 };

 } // end anonymous namespace
	//===-- CUFDeviceFuncTransform.cpp ----------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "flang/Optimizer/Builder/CUFCommon.h"
	#include "flang/Optimizer/Builder/Todo.h"
	#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
	#include "flang/Optimizer/Dialect/FIRAttr.h"
	#include "flang/Optimizer/Dialect/FIRDialect.h"
	#include "flang/Optimizer/Dialect/FIROpsSupport.h"
	#include "flang/Optimizer/Support/InternalNames.h"
	#include "flang/Optimizer/Transforms/Passes.h"
	#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
	#include "mlir/Dialect/GPU/IR/GPUDialect.h"
	#include "mlir/Dialect/Index/IR/IndexDialect.h"
	#include "mlir/Dialect/Index/IR/IndexOps.h"
	#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
	#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
	#include "mlir/Dialect/SCF/IR/SCF.h"
	#include "mlir/IR/IRMapping.h"
	#include "mlir/Pass/Pass.h"
	#include "mlir/Transforms/RegionUtils.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/StringSet.h"

	namespace fir {
	#define GEN_PASS_DEF_CUFDEVICEFUNCTRANSFORM
	#include "flang/Optimizer/Transforms/Passes.h.inc"
	} // namespace fir

	using namespace mlir;

	namespace {

	/// Module pass that migrates CUDA Fortran device procedures to the GPU module.
	///
	/// A `func.func` is treated as a device procedure when its `cuf` procedure
	/// attribute is Device, Global, GridGlobal or HostDevice. Declarations are
	/// cloned into the GPU module; definitions are rebuilt there as `gpu.func`.
	/// Functions called directly from device procedures or from `cuf.kernel`
	/// operations are cloned into the GPU module as well.
	class CUFDeviceFuncTransform
	    : public fir::impl::CUFDeviceFuncTransformBase<CUFDeviceFuncTransform> {
	  using CUFDeviceFuncTransformBase<
	      CUFDeviceFuncTransform>::CUFDeviceFuncTransformBase;

	  /// Build a `gpu.func` mirroring the definition \p funcOp.
	  ///
	  /// The body of \p funcOp is cloned into the new operation, every
	  /// `func.return` of the clone is replaced by a `gpu.return` carrying the
	  /// same operands, and the `cuf` launch bounds attribute, when present, is
	  /// translated to NVVM attributes. \p funcOp itself is not modified.
	  ///
	  /// \param funcOp     function definition to mirror; it must have a body.
	  /// \param isGlobal   when true, the result is tagged with the GPU kernel
	  ///                   attribute.
	  /// \param computeCap compute capability; the cluster size bound is only
	  ///                   emitted when this is at least 90.
	  /// \returns the new `gpu.func`. The builder used here has no insertion
	  ///          point, so inserting the operation is left to the caller.
	  static gpu::GPUFuncOp createGPUFuncOp(mlir::func::FuncOp funcOp,
	                                        bool isGlobal, int computeCap) {
	    mlir::OpBuilder builder(funcOp.getContext());
	    mlir::Location loc = funcOp.getLoc();

	    // The device function keeps the exact signature of the original.
	    auto deviceFuncOp = gpu::GPUFuncOp::create(
	        builder, loc, funcOp.getName(), funcOp.getFunctionType(),
	        mlir::TypeRange{}, mlir::TypeRange{});
	    if (isGlobal)
	      deviceFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
	                            builder.getUnitAttr());

	    mlir::Region &deviceFuncBody = deviceFuncOp.getBody();
	    mlir::Block &entryBlock = deviceFuncBody.front();

	    // While cloning, redirect uses of the original arguments to the
	    // arguments of the gpu.func entry block.
	    mlir::IRMapping map;
	    for (const auto &arg : llvm::enumerate(funcOp.getArguments()))
	      map.map(arg.value(), entryBlock.getArgument(arg.index()));

	    funcOp.getBody().cloneInto(&deviceFuncBody, map);

	    // The cloned body still ends in func.return; replace each one with a
	    // gpu.return carrying the same operands.
	    deviceFuncOp.walk([](func::ReturnOp op) {
	      mlir::OpBuilder replacer(op);
	      gpu::ReturnOp gpuReturnOp = gpu::ReturnOp::create(replacer, op.getLoc());
	      gpuReturnOp->setOperands(op.getOperands());
	      op.erase();
	    });

	    // The gpu.func already had an entry block before cloning. Move the
	    // operations of the cloned entry block into it and drop the now empty
	    // clone.
	    mlir::Block *clonedFuncOpEntry = map.lookup(&funcOp.front());
	    entryBlock.getOperations().splice(entryBlock.getOperations().end(),
	                                      clonedFuncOpEntry->getOperations());
	    clonedFuncOpEntry->erase();

	    // Translate the CUF launch bounds into their NVVM counterparts.
	    auto launchBoundsAttr =
	        funcOp.getOperation()->getAttrOfType<cuf::LaunchBoundsAttr>(
	            cuf::getLaunchBoundsAttrName());
	    if (launchBoundsAttr) {
	      auto maxTPB = launchBoundsAttr.getMaxTPB().getInt();
	      auto maxntid =
	          builder.getDenseI32ArrayAttr({static_cast<int32_t>(maxTPB), 1, 1});
	      deviceFuncOp->setAttr(NVVM::NVVMDialect::getMaxntidAttrName(), maxntid);
	      deviceFuncOp->setAttr(NVVM::NVVMDialect::getMinctasmAttrName(),
	                            launchBoundsAttr.getMinBPM());
	      if (computeCap >= 90 && launchBoundsAttr.getUpperBoundClusterSize())
	        deviceFuncOp->setAttr(NVVM::NVVMDialect::getClusterMaxBlocksAttrName(),
	                              launchBoundsAttr.getUpperBoundClusterSize());
	    }

	    return deviceFuncOp;
	  }

	  /// Replace \p funcOp in \p mod by a stub whose body is a single
	  /// `func.return`.
	  ///
	  /// The stub is appended at the end of \p mod and takes over the name,
	  /// function type, visibility and attributes of \p funcOp. \p funcOp is
	  /// erased through \p symTab before the stub is registered there.
	  ///
	  /// NOTE(review): the stub always ends in an operand-less `func.return`, so
	  /// this presumably expects \p funcOp to have no results - confirm for
	  /// functions that are kept because of a binding table reference.
	  static void createHostStub(mlir::func::FuncOp funcOp,
	                             mlir::SymbolTable &symTab, mlir::ModuleOp mod) {
	    mlir::Location loc = funcOp.getLoc();
	    mlir::OpBuilder modBuilder(mod.getBodyRegion());
	    modBuilder.setInsertionPointToEnd(mod.getBody());
	    auto emptyStub = func::FuncOp::create(modBuilder, loc, funcOp.getName(),
	                                          funcOp.getFunctionType());
	    emptyStub.setVisibility(funcOp.getVisibility());
	    emptyStub->setAttrs(funcOp->getAttrs());
	    auto entryBlock = emptyStub.addEntryBlock();
	    modBuilder.setInsertionPointToEnd(entryBlock);
	    func::ReturnOp::create(modBuilder, loc);

	    // Erase the original first so that the stub is registered under the
	    // same symbol name.
	    symTab.erase(funcOp);
	    symTab.insert(emptyStub);
	  }

	  /// Return true when \p funcOp carries a `cuf` procedure attribute whose
	  /// value is Device, Global, GridGlobal or HostDevice.
	  static bool isDeviceFunc(mlir::func::FuncOp funcOp) {
	    auto cudaProcAttr =
	        funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
	            cuf::getProcAttrName());
	    if (!cudaProcAttr)
	      return false;
	    const auto kind = cudaProcAttr.getValue();
	    return kind == cuf::ProcAttribute::Device ||
	           kind == cuf::ProcAttribute::Global ||
	           kind == cuf::ProcAttribute::GridGlobal ||
	           kind == cuf::ProcAttribute::HostDevice;
	  }

	  /// Collect the device procedures of the module and the functions they
	  /// call, then populate the GPU module and clean up the host module.
	  void runOnOperation() override {
	    // Working on Module operation because inserting/removing function from the
	    // module is not thread-safe.
	    ModuleOp mod = getOperation();
	    mlir::SymbolTable symbolTable(mod);
	    mlir::OpBuilder builder(mod.getContext());

	    gpu::GPUModuleOp gpuMod = cuf::getOrCreateGPUModule(mod, symbolTable);
	    mlir::SymbolTable gpuModSymTab(gpuMod);

	    llvm::SetVector<mlir::func::FuncOp> funcsToClone;
	    llvm::SetVector<mlir::func::FuncOp> deviceFuncs;
	    llvm::SetVector<mlir::func::FuncOp> keepInModule;
	    llvm::StringSet<> deviceFuncNames;

	    // Look for all functions to migrate to the GPU module.
	    mod.walk([&](mlir::func::FuncOp op) {
	      if (isDeviceFunc(op)) {
	        deviceFuncs.insert(op);
	        deviceFuncNames.insert(op.getSymName());
	      }
	    });

	    // Record the callee of a direct call so that it gets cloned into the GPU
	    // module, unless it is itself a device procedure handled below.
	    auto processCallOp = [&](fir::CallOp op) {
	      if (!op.getCallee())
	        return;
	      auto func = symbolTable.lookup<mlir::func::FuncOp>(
	          op.getCallee()->getLeafReference());
	      // The lookup yields a null handle when the callee is not a func.func
	      // of this module. Recording it would crash when cloning below.
	      if (!func)
	        return;
	      if (!deviceFuncs.contains(func))
	        funcsToClone.insert(func);
	    };

	    // Gather all functions called by device functions.
	    for (auto funcOp : deviceFuncs) {
	      funcOp.walk([&](fir::CallOp op) { processCallOp(op); });
	      funcOp.walk([&](fir::DispatchOp op) {
	        TODO(op.getLoc(), "type-bound procedure call with dynamic dispatch "
	                          "in device procedure");
	      });
	    }

	    // Functions that are referenced in a derived-type binding table must be
	    // kept in the host module to avoid LLVM dialect verification errors.
	    for (auto globalOp : mod.getOps<fir::GlobalOp>()) {
	      if (!globalOp.getName().contains(fir::kBindingTableSeparator))
	        continue;
	      globalOp.walk([&](fir::AddrOfOp addrOfOp) {
	        llvm::StringRef name =
	            addrOfOp.getSymbol().getLeafReference().getValue();
	        if (!deviceFuncNames.contains(name))
	          return;
	        // The name was recorded from deviceFuncs, so the search succeeds.
	        keepInModule.insert(
	            *llvm::find_if(deviceFuncs, [&](mlir::func::FuncOp f) {
	              return f.getSymName() == name;
	            }));
	      });
	    }

	    // Gather all functions called by CUF kernels.
	    mod.walk([&](cuf::KernelOp kernelOp) {
	      kernelOp.walk([&](fir::CallOp op) { processCallOp(op); });
	      kernelOp.walk([&](fir::DispatchOp op) {
	        TODO(op.getLoc(),
	             "type-bound procedure call with dynamic dispatch in cuf kernel");
	      });
	    });

	    for (auto funcOp : funcsToClone)
	      gpuModSymTab.insert(funcOp->clone());

	    for (auto funcOp : deviceFuncs) {
	      auto cudaProcAttr =
	          funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
	              cuf::getProcAttrName());
	      const auto procKind = cudaProcAttr.getValue();
	      const bool isGlobal = procKind == cuf::ProcAttribute::Global ||
	                            procKind == cuf::ProcAttribute::GridGlobal;

	      if (funcOp.isDeclaration()) {
	        // Declarations are cloned as is; the host declaration is kept.
	        mlir::Operation *clonedFuncOp = funcOp->clone();
	        if (isGlobal) {
	          clonedFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
	                                builder.getUnitAttr());
	          clonedFuncOp->removeAttr(cuf::getProcAttrName());
	        }
	        gpuModSymTab.insert(clonedFuncOp);
	        continue;
	      }

	      gpu::GPUFuncOp deviceFuncOp =
	          createGPUFuncOp(funcOp, isGlobal, computeCap);
	      gpuModSymTab.insert(deviceFuncOp);

	      // Host-device procedures keep their host definition unchanged.
	      if (procKind == cuf::ProcAttribute::HostDevice)
	        continue;

	      // If the function is a global, we need to keep the host side
	      // declaration for the kernel registration. Currently we just
	      // erase its body but in the future, the body should be rewritten
	      // to be able to launch CUDA Fortran kernel from C code.
	      if (isGlobal || keepInModule.contains(funcOp))
	        createHostStub(funcOp, symbolTable, mod);
	      else
	        funcOp.erase();
	    }
	  }
	};

	} // end anonymous namespace