[flang][cuda] Do inline allocation/deallocation in device code (#106628)
ALLOCATE and DEALLOCATE statements can be inlined in device functions.
This patch updates the condition that determines whether these actions
are inlined during lowering.
This avoids runtime calls in device function code and can speed up
execution.
Also move `isCudaDeviceContext` out of `Bridge.cpp` into the
`Fortran::lower` namespace so it can be used elsewhere.
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index d4d999f..fb8380a 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -15,6 +15,7 @@
#include "flang/Lower/AbstractConverter.h"
#include "flang/Lower/ConvertType.h"
#include "flang/Lower/ConvertVariable.h"
+#include "flang/Lower/Cuda.h"
#include "flang/Lower/IterationSpace.h"
#include "flang/Lower/Mangler.h"
#include "flang/Lower/OpenACC.h"
@@ -453,16 +454,22 @@
void genSimpleAllocation(const Allocation &alloc,
const fir::MutableBoxValue &box) {
- if (!box.isDerived() && !errorManager.hasStatSpec() &&
- !alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() &&
- !useAllocateRuntime && !box.isPointer() &&
- !Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
+ bool isCudaSymbol = Fortran::semantics::HasCUDAAttr(alloc.getSymbol());
+ bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
+ bool inlineAllocation = !box.isDerived() && !errorManager.hasStatSpec() &&
+ !alloc.type.IsPolymorphic() &&
+ !alloc.hasCoarraySpec() && !useAllocateRuntime &&
+ !box.isPointer();
+
+ if (inlineAllocation &&
+ ((isCudaSymbol && isCudaDeviceContext) || !isCudaSymbol)) {
// Pointers must use PointerAllocate so that their deallocations
// can be validated.
genInlinedAllocation(alloc, box);
postAllocationAction(alloc);
return;
}
+
// Generate a sequence of runtime calls.
errorManager.genStatCheck(builder, loc);
genAllocateObjectInit(box);
@@ -473,7 +480,7 @@
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
mlir::Value stat;
- if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+ if (!isCudaSymbol)
stat = genRuntimeAllocate(builder, loc, box, errorManager);
else
stat =
@@ -830,10 +837,14 @@
mlir::Value declaredTypeDesc = {},
const Fortran::semantics::Symbol *symbol = nullptr) {
bool isCudaSymbol = symbol && Fortran::semantics::HasCUDAAttr(*symbol);
- // Deallocate intrinsic types inline.
- if (!box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
+ bool isCudaDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
+ bool inlineDeallocation =
+ !box.isDerived() && !box.isPolymorphic() && !box.hasAssumedRank() &&
!box.isUnlimitedPolymorphic() && !errorManager.hasStatSpec() &&
- !useAllocateRuntime && !box.isPointer() && !isCudaSymbol) {
+ !useAllocateRuntime && !box.isPointer();
+ // Deallocate intrinsic types inline.
+ if (inlineDeallocation &&
+ ((isCudaSymbol && isCudaDeviceContext) || !isCudaSymbol)) {
// Pointers must use PointerDeallocate so that their deallocations
// can be validated.
mlir::Value ret = fir::factory::genFreemem(builder, loc, box);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 078e17b..90943fa 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -20,6 +20,7 @@
#include "flang/Lower/ConvertExprToHLFIR.h"
#include "flang/Lower/ConvertType.h"
#include "flang/Lower/ConvertVariable.h"
+#include "flang/Lower/Cuda.h"
#include "flang/Lower/HostAssociations.h"
#include "flang/Lower/IO.h"
#include "flang/Lower/IterationSpace.h"
@@ -4377,36 +4378,13 @@
return temps;
}
- // Check if the insertion point is currently in a device context. HostDevice
- // subprogram are not considered fully device context so it will return false
- // for it.
- // If the insertion point is inside an OpenACC region op, it is considered
- // device context.
- static bool isCudaDeviceContext(fir::FirOpBuilder &builder) {
- if (builder.getRegion().getParentOfType<cuf::KernelOp>())
- return true;
- if (builder.getRegion()
- .getParentOfType<mlir::acc::ComputeRegionOpInterface>())
- return true;
- if (auto funcOp =
- builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
- if (auto cudaProcAttr =
- funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
- cuf::getProcAttrName())) {
- return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
- cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
- }
- }
- return false;
- }
-
void genDataAssignment(
const Fortran::evaluate::Assignment &assign,
const Fortran::evaluate::ProcedureRef *userDefinedAssignment) {
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();
- bool isInDeviceContext = isCudaDeviceContext(builder);
+ bool isInDeviceContext = Fortran::lower::isCudaDeviceContext(builder);
bool isCUDATransfer = (Fortran::evaluate::HasCUDADeviceAttrs(assign.lhs) ||
Fortran::evaluate::HasCUDADeviceAttrs(assign.rhs)) &&