[Libomptarget] Fix RPC-based malloc on NVPTX  (#72440)

Summary:
The device allocator on NVPTX architectures is enqueued to a stream that
the kernel may be executing on. This can lead to a deadlock: the kernel
will not proceed until the allocation is complete, and the allocation
will not proceed until the kernel is complete. CUDA 11.2 introduced
asynchronous allocations that we can manually place on separate streams
to combat this. This patch adds a new allocation type that is guaranteed
to be non-blocking, so it will actually make progress. Only the NVIDIA
plugin needs to care about this, as the other plugins do not block in
this way by default.

I had originally tried to make the `alloc` and `free` methods take a
`__tgt_async_info`. However, I observed that with the large volume of
streams being created by a parallel test, the system quickly locked up,
presumably because too many streams were being created. This
implementation instead just creates a new stream for each call and
immediately destroys it. This obviously isn't very fast, but it at least
stops these cases from deadlocking for now.
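
For illustration, the non-blocking path now follows roughly the CUDA
driver API pattern sketched below (a minimal standalone example, not the
plugin code itself; the `check` helper and the 1024-byte size are made up
for the sketch, and it requires CUDA 11.2+ for `cuMemAllocAsync`):

```c++
#include <cstdio>
#include <cstdlib>
#include <cuda.h>

// Hypothetical error-checking helper for this sketch only.
static void check(CUresult Res, const char *Msg) {
  if (Res != CUDA_SUCCESS) {
    std::fprintf(stderr, "%s failed with error %d\n", Msg, Res);
    std::exit(EXIT_FAILURE);
  }
}

int main() {
  check(cuInit(0), "cuInit");
  CUdevice Device;
  check(cuDeviceGet(&Device, 0), "cuDeviceGet");
  CUcontext Context;
  check(cuCtxCreate(&Context, 0, Device), "cuCtxCreate");

  // Allocate on a private non-blocking stream so the allocation cannot be
  // serialized behind a kernel that is waiting on its result.
  CUstream AllocStream;
  check(cuStreamCreate(&AllocStream, CU_STREAM_NON_BLOCKING), "cuStreamCreate");
  CUdeviceptr Ptr;
  check(cuMemAllocAsync(&Ptr, /*bytesize=*/1024, AllocStream), "cuMemAllocAsync");
  check(cuStreamSynchronize(AllocStream), "cuStreamSynchronize");
  check(cuStreamDestroy(AllocStream), "cuStreamDestroy");

  // The free goes through the same create/sync/destroy steps on its own stream.
  CUstream FreeStream;
  check(cuStreamCreate(&FreeStream, CU_STREAM_NON_BLOCKING), "cuStreamCreate");
  check(cuMemFreeAsync(Ptr, FreeStream), "cuMemFreeAsync");
  check(cuStreamSynchronize(FreeStream), "cuStreamSynchronize");
  check(cuStreamDestroy(FreeStream), "cuStreamDestroy");

  check(cuCtxDestroy(Context), "cuCtxDestroy");
  std::printf("non-blocking alloc/free completed\n");
  return 0;
}
```

Creating, synchronizing, and destroying a fresh stream per call is the
slow-but-safe trade-off described above.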

GitOrigin-RevId: fb32977ac768f27890af28308a6968c30af2aa3e
diff --git a/libomptarget/include/omptarget.h b/libomptarget/include/omptarget.h
index 476a158..d5602ee 100644
--- a/libomptarget/include/omptarget.h
+++ b/libomptarget/include/omptarget.h
@@ -103,7 +103,9 @@
   TARGET_ALLOC_DEVICE = 0,
   TARGET_ALLOC_HOST,
   TARGET_ALLOC_SHARED,
-  TARGET_ALLOC_DEFAULT
+  TARGET_ALLOC_DEFAULT,
+  /// The allocation will not block on other streams.
+  TARGET_ALLOC_DEVICE_NON_BLOCKING,
 };
 
 inline KernelArgsTy CTorDTorKernelArgs = {1,       0,       nullptr,   nullptr,
diff --git a/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index fe435a3..0411c67 100644
--- a/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2112,6 +2112,7 @@
     switch (Kind) {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemoryPool = CoarseGrainedMemoryPools[0];
       break;
     case TARGET_ALLOC_HOST:
@@ -3315,6 +3316,7 @@
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
   case TARGET_ALLOC_DEVICE:
+  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
     MemoryPool = CoarseGrainedMemoryPools[0];
     break;
   case TARGET_ALLOC_HOST:
diff --git a/libomptarget/plugins-nextgen/common/src/RPC.cpp b/libomptarget/plugins-nextgen/common/src/RPC.cpp
index 60e0540..54aced1 100644
--- a/libomptarget/plugins-nextgen/common/src/RPC.cpp
+++ b/libomptarget/plugins-nextgen/common/src/RPC.cpp
@@ -62,15 +62,14 @@
         "Failed to initialize RPC server for device %d: %d", DeviceId, Err);
 
   // Register a custom opcode handler to perform plugin specific allocation.
-  // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
   auto MallocHandler = [](rpc_port_t Port, void *Data) {
     rpc_recv_and_send(
         Port,
         [](rpc_buffer_t *Buffer, void *Data) {
           plugin::GenericDeviceTy &Device =
               *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-          Buffer->data[0] = reinterpret_cast<uintptr_t>(
-              Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
+          Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
+              Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
         },
         Data);
   };
@@ -88,7 +87,7 @@
           plugin::GenericDeviceTy &Device =
               *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
           Device.free(reinterpret_cast<void *>(Buffer->data[0]),
-                      TARGET_ALLOC_DEVICE);
+                      TARGET_ALLOC_DEVICE_NON_BLOCKING);
         },
         Data);
   };
diff --git a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 56c4404..5ec3adb 100644
--- a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -43,6 +43,7 @@
 DLWRAP(cuMemAlloc, 2)
 DLWRAP(cuMemAllocHost, 2)
 DLWRAP(cuMemAllocManaged, 3)
+DLWRAP(cuMemAllocAsync, 3)
 
 DLWRAP(cuMemcpyDtoDAsync, 4)
 DLWRAP(cuMemcpyDtoH, 3)
@@ -52,6 +53,8 @@
 
 DLWRAP(cuMemFree, 1)
 DLWRAP(cuMemFreeHost, 1)
+DLWRAP(cuMemFreeAsync, 2)
+
 DLWRAP(cuModuleGetFunction, 3)
 DLWRAP(cuModuleGetGlobal, 4)
 
diff --git a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 3e03077..32031c2 100644
--- a/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -293,6 +293,7 @@
 CUresult cuMemAlloc(CUdeviceptr *, size_t);
 CUresult cuMemAllocHost(void **, size_t);
 CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
+CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);
 
 CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
 CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
@@ -302,6 +303,7 @@
 
 CUresult cuMemFree(CUdeviceptr);
 CUresult cuMemFreeHost(void *);
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
 
 CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
 CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
diff --git a/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index b0dff91..0005bff 100644
--- a/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -63,6 +63,14 @@
                               CUmemAllocationGranularity_flags option) {}
 #endif
 
+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
+// Stub implementations of the asynchronous memory management functions for
+// older versions of CUDA that do not provide them.
+CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = 0; return CUDA_ERROR_NOT_SUPPORTED; }
+
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream) { return CUDA_ERROR_NOT_SUPPORTED; }
+#endif
+
 /// Class implementing the CUDA device images properties.
 struct CUDADeviceImageTy : public DeviceImageTy {
   /// Create the CUDA image with the id and the target image pointer.
@@ -488,6 +496,16 @@
       Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
       MemAlloc = (void *)DevicePtr;
       break;
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+      CUstream Stream;
+      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+        break;
+      if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
+        break;
+      cuStreamSynchronize(Stream);
+      Res = cuStreamDestroy(Stream);
+      MemAlloc = (void *)DevicePtr;
+    }
     }
 
     if (auto Err =
@@ -518,6 +536,15 @@
     case TARGET_ALLOC_HOST:
       Res = cuMemFreeHost(TgtPtr);
       break;
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+      CUstream Stream;
+      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+        break;
+      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+      cuStreamSynchronize(Stream);
+      if ((Res = cuStreamDestroy(Stream)))
+        break;
+    }
     }
 
     if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
diff --git a/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 88b5236..43569f2 100644
--- a/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -215,6 +215,7 @@
     case TARGET_ALLOC_DEVICE:
     case TARGET_ALLOC_HOST:
     case TARGET_ALLOC_SHARED:
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemAlloc = std::malloc(Size);
       break;
     }
diff --git a/libomptarget/test/libc/malloc.c b/libomptarget/test/libc/malloc.c
index c18a724..b587b61 100644
--- a/libomptarget/test/libc/malloc.c
+++ b/libomptarget/test/libc/malloc.c
@@ -13,7 +13,7 @@
   unsigned *d_x;
 #pragma omp target map(from : d_x)
   {
-    d_x = malloc(sizeof(unsigned));
+    d_x = (unsigned *)malloc(sizeof(unsigned));
     *d_x = 1;
   }
 
@@ -23,6 +23,14 @@
 #pragma omp target is_device_ptr(d_x)
   { free(d_x); }
 
+#pragma omp target teams num_teams(64)
+#pragma omp parallel num_threads(32)
+  {
+    int *ptr = (int *)malloc(sizeof(int));
+    *ptr = 42;
+    free(ptr);
+  }
+
   // CHECK: PASS
   if (h_x == 1)
     fputs("PASS\n", stdout);