[Libomptarget] Fix RPC-based malloc on NVPTX  (#72440)

Summary:
The device allocator on NVPTX architectures is enqueued to a stream that
the kernel may be executing on. This can lead to a deadlock: the kernel
will not proceed until the allocation is complete, and the allocation
will not proceed until the kernel is complete. CUDA 11.2 introduced
asynchronous allocations that we can manually place on separate streams
to combat this. This patch adds a new allocation type that is guaranteed
to be non-blocking, so it will actually make progress. Only the NVIDIA
plugin needs to care about this, as the other plugins do not block in
this way by default.

I had originally tried to make the `alloc` and `free` methods take a
`__tgt_async_info`. However, I observed that with the large volume of
streams being created by a parallel test, the system quickly locked up,
presumably because too many streams were being created. This
implementation instead just creates a new stream for each call and
immediately destroys it. This obviously isn't very fast, but it at least
stops these cases from deadlocking for now.
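
For illustration, the non-blocking path now follows roughly the CUDA
driver API pattern sketched below (a minimal standalone example, not the
plugin code itself; the `check` helper and the 1024-byte size are made up
for the sketch, and it requires CUDA 11.2+ for `cuMemAllocAsync`):

```c++
#include <cstdio>
#include <cstdlib>
#include <cuda.h>

// Hypothetical error-checking helper for this sketch only.
static void check(CUresult Res, const char *Msg) {
  if (Res != CUDA_SUCCESS) {
    std::fprintf(stderr, "%s failed with error %d\n", Msg, Res);
    std::exit(EXIT_FAILURE);
  }
}

int main() {
  check(cuInit(0), "cuInit");
  CUdevice Device;
  check(cuDeviceGet(&Device, 0), "cuDeviceGet");
  CUcontext Context;
  check(cuCtxCreate(&Context, 0, Device), "cuCtxCreate");

  // Allocate on a private non-blocking stream so the allocation cannot be
  // serialized behind a kernel that is waiting on its result.
  CUstream AllocStream;
  check(cuStreamCreate(&AllocStream, CU_STREAM_NON_BLOCKING), "cuStreamCreate");
  CUdeviceptr Ptr;
  check(cuMemAllocAsync(&Ptr, /*bytesize=*/1024, AllocStream), "cuMemAllocAsync");
  check(cuStreamSynchronize(AllocStream), "cuStreamSynchronize");
  check(cuStreamDestroy(AllocStream), "cuStreamDestroy");

  // The free goes through the same create/sync/destroy steps on its own stream.
  CUstream FreeStream;
  check(cuStreamCreate(&FreeStream, CU_STREAM_NON_BLOCKING), "cuStreamCreate");
  check(cuMemFreeAsync(Ptr, FreeStream), "cuMemFreeAsync");
  check(cuStreamSynchronize(FreeStream), "cuStreamSynchronize");
  check(cuStreamDestroy(FreeStream), "cuStreamDestroy");

  check(cuCtxDestroy(Context), "cuCtxDestroy");
  std::printf("non-blocking alloc/free completed\n");
  return 0;
}
```

Creating, synchronizing, and destroying a fresh stream per call is the
slow-but-safe trade-off described above.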

GitOrigin-RevId: fb32977ac768f27890af28308a6968c30af2aa3e
diff --git a/libomptarget/include/omptarget.h b/libomptarget/include/omptarget.h
index 476a158..d5602ee 100644
--- a/libomptarget/include/omptarget.h
+++ b/libomptarget/include/omptarget.h
@@ -103,7 +103,9 @@
   TARGET_ALLOC_DEVICE = 0,
   TARGET_ALLOC_HOST,
   TARGET_ALLOC_SHARED,
-  TARGET_ALLOC_DEFAULT
+  TARGET_ALLOC_DEFAULT,
+  /// The allocation will not block on other streams.
+  TARGET_ALLOC_DEVICE_NON_BLOCKING,
 };
 
 inline KernelArgsTy CTorDTorKernelArgs = {1,       0,       nullptr,   nullptr,
diff --git a/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index fe435a3..0411c67 100644
--- a/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2112,6 +2112,7 @@
     switch (Kind) {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemoryPool = CoarseGrainedMemoryPools[0];
       break;
     case TARGET_ALLOC_HOST:
@@ -3315,6 +3316,7 @@
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
   case TARGET_ALLOC_DEVICE:
+  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
     MemoryPool = CoarseGrainedMemoryPools[0];
     break;
   case TARGET_ALLOC_HOST:
diff --git a/libomptarget/plugins-nextgen/common/src/RPC.cpp b/libomptarget/plugins-nextgen/common/src/RPC.cpp
index 60e0540..54aced1 100644
--- a/libomptarget/plugins-nextgen/common/src/RPC.cpp
+++ b/libomptarget/plugins-nextgen/common/src/RPC.cpp
@@ -62,15 +62,14 @@
         "Failed to initialize RPC server for device %d: %d", DeviceId, Err);
 
   // Register a custom opcode handler to perform plugin specific allocation.
-  // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
   auto MallocHandler = [](rpc_port_t Port, void *Data) {
     rpc_recv_and_send(
         Port,
         [](rpc_buffer_t *Buffer, void *Data) {
           plugin::GenericDeviceTy &Device =
               *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-          Buffer->data[0] = reinterpret_cast<uintptr_t>(
-              Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
+          Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
+              Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
         },
         Data);
   };
@@ -88,7 +87,7 @@
           plugin::GenericDeviceTy &Device =
               *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
           Device.free(reinterpret_cast<void *>(Buffer->data[0]),
-                      TARGET_ALLOC_DEVICE);
+                      TARGET_ALLOC_DEVICE_NON_BLOCKING);
         },
         Data);
   };
diff --git a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 56c4404..5ec3adb 100644
--- a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -43,6 +43,7 @@
 DLWRAP(cuMemAlloc, 2)
 DLWRAP(cuMemAllocHost, 2)
 DLWRAP(cuMemAllocManaged, 3)
+DLWRAP(cuMemAllocAsync, 3)
 
 DLWRAP(cuMemcpyDtoDAsync, 4)
 DLWRAP(cuMemcpyDtoH, 3)
@@ -52,6 +53,8 @@
 
 DLWRAP(cuMemFree, 1)
 DLWRAP(cuMemFreeHost, 1)
+DLWRAP(cuMemFreeAsync, 2)
+
 DLWRAP(cuModuleGetFunction, 3)
 DLWRAP(cuModuleGetGlobal, 4)
 
diff --git a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 3e03077..32031c2 100644
--- a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -293,6 +293,7 @@
 CUresult cuMemAlloc(CUdeviceptr *, size_t);
 CUresult cuMemAllocHost(void **, size_t);
 CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
+CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);
 
 CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
 CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
@@ -302,6 +303,7 @@
 
 CUresult cuMemFree(CUdeviceptr);
 CUresult cuMemFreeHost(void *);
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
 
 CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
 CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
diff --git a/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index b0dff91..0005bff 100644
--- a/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -63,6 +63,14 @@
                               CUmemAllocationGranularity_flags option) {}
 #endif
 
+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
+// Stub implementations of the asynchronous memory management functions for
+// older versions of CUDA that do not provide them.
+CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = 0; return CUDA_ERROR_NOT_SUPPORTED; }
+
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream) { return CUDA_ERROR_NOT_SUPPORTED; }
+#endif
+
 /// Class implementing the CUDA device images properties.
 struct CUDADeviceImageTy : public DeviceImageTy {
   /// Create the CUDA image with the id and the target image pointer.
@@ -488,6 +496,16 @@
       Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
       MemAlloc = (void *)DevicePtr;
       break;
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+      CUstream Stream;
+      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+        break;
+      if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
+        break;
+      cuStreamSynchronize(Stream);
+      Res = cuStreamDestroy(Stream);
+      MemAlloc = (void *)DevicePtr;
+    }
     }
 
     if (auto Err =
@@ -518,6 +536,15 @@
     case TARGET_ALLOC_HOST:
       Res = cuMemFreeHost(TgtPtr);
       break;
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+      CUstream Stream;
+      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+        break;
+      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+      cuStreamSynchronize(Stream);
+      if ((Res = cuStreamDestroy(Stream)))
+        break;
+    }
     }
 
     if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
diff --git a/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 88b5236..43569f2 100644
--- a/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -215,6 +215,7 @@
     case TARGET_ALLOC_DEVICE:
     case TARGET_ALLOC_HOST:
     case TARGET_ALLOC_SHARED:
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemAlloc = std::malloc(Size);
       break;
     }
diff --git a/libomptarget/test/libc/malloc.c b/libomptarget/test/libc/malloc.c
index c18a724..b587b61 100644
--- a/libomptarget/test/libc/malloc.c
+++ b/libomptarget/test/libc/malloc.c
@@ -13,7 +13,7 @@
   unsigned *d_x;
 #pragma omp target map(from : d_x)
   {
-    d_x = malloc(sizeof(unsigned));
+    d_x = (unsigned *)malloc(sizeof(unsigned));
     *d_x = 1;
   }
 
@@ -23,6 +23,14 @@
 #pragma omp target is_device_ptr(d_x)
   { free(d_x); }
 
+#pragma omp target teams num_teams(64)
+#pragma omp parallel num_threads(32)
+  {
+    int *ptr = (int *)malloc(sizeof(int));
+    *ptr = 42;
+    free(ptr);
+  }
+
   // CHECK: PASS
   if (h_x == 1)
     fputs("PASS\n", stdout);