[libomptarget] Add allocator support for target memory

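This patch adds the infrastructure for allocator support for target memory.
Three allocators are introduced for device, host and shared memory; the plugin
__tgt_rtl_data_alloc entry point gains a kind argument to select between them.
For now only the generic-elf-64bit plugin accepts every kind; the other plugins
report an error and return NULL for anything but the default kind.
The corresponding API functions have the llvm_ prefix temporarily, until they become part of the OpenMP standard.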

Differential Revision: https://reviews.llvm.org/D97883

GitOrigin-RevId: 2468fdd9af361cb46d02d00a52e87067e7078127
diff --git a/libomptarget/include/omptarget.h b/libomptarget/include/omptarget.h
index 7b317c4..9e3667e 100644
--- a/libomptarget/include/omptarget.h
+++ b/libomptarget/include/omptarget.h
@@ -86,6 +86,13 @@
   OMP_REQ_DYNAMIC_ALLOCATORS      = 0x010
 };
 
+enum TargetAllocTy : int32_t {
+  TARGET_ALLOC_DEVICE = 0, // kind passed by llvm_omp_target_alloc_device
+  TARGET_ALLOC_HOST,       // kind passed by llvm_omp_target_alloc_host
+  TARGET_ALLOC_SHARED,     // kind passed by llvm_omp_target_alloc_shared
+  TARGET_ALLOC_DEFAULT     // kind passed by omp_target_alloc
+};
+
 /// This struct is a record of an entry point or global. For a function
 /// entry point the size is expected to be zero
 struct __tgt_offload_entry {
@@ -190,6 +197,12 @@
                              size_t device_offset, int device_num);
 int omp_target_disassociate_ptr(void *host_ptr, int device_num);
 
+/// Explicit target memory allocators
+/// Using the llvm_ prefix until they become part of the OpenMP standard.
+void *llvm_omp_target_alloc_device(size_t size, int device_num);
+void *llvm_omp_target_alloc_host(size_t size, int device_num);
+void *llvm_omp_target_alloc_shared(size_t size, int device_num);
+
 /// add the clauses of the requires directives in a given file
 void __tgt_register_requires(int64_t flags);
 
diff --git a/libomptarget/include/omptargetplugin.h b/libomptarget/include/omptargetplugin.h
index a315cdd..721b9d5 100644
--- a/libomptarget/include/omptargetplugin.h
+++ b/libomptarget/include/omptargetplugin.h
@@ -65,8 +65,10 @@
 // initialize the target data mapping structures. These addresses are
 // used to generate a table of target variables to pass to
 // __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
-// case an error occurred on the target device.
-void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr);
+// case an error occurred on the target device. Kind dictates what allocator
+// to use (e.g. shared, host, device).
+void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr,
+                           int32_t Kind);
 
 // Pass the data content to the target device using the target address. In case
 // of success, return zero. Otherwise, return an error code.
diff --git a/libomptarget/plugins/amdgpu/src/rtl.cpp b/libomptarget/plugins/amdgpu/src/rtl.cpp
index 68d7913..0e8df9e 100644
--- a/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -1488,9 +1488,16 @@
   return DeviceInfo.getOffloadEntriesTable(device_id);
 }
 
-void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *) {
+void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) {
   void *ptr = NULL;
   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
+
+  if (kind != TARGET_ALLOC_DEFAULT) {
+    REPORT("Invalid target data allocation kind or requested allocator not "
+           "implemented yet\n");
+    return NULL;
+  }
+
   atmi_status_t err = atmi_malloc(&ptr, size, get_gpu_mem_place(device_id));
   DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
      (long long unsigned)(Elf64_Addr)ptr);
diff --git a/libomptarget/plugins/cuda/src/rtl.cpp b/libomptarget/plugins/cuda/src/rtl.cpp
index 5647099..3d0424f 100644
--- a/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/libomptarget/plugins/cuda/src/rtl.cpp
@@ -1095,9 +1095,16 @@
   return DeviceRTL.loadBinary(device_id, image);
 }
 
-void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
+                           int32_t kind) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
 
+  if (kind != TARGET_ALLOC_DEFAULT) {
+    REPORT("Invalid target data allocation kind or requested allocator not "
+           "implemented yet\n");
+    return NULL;
+  }
+
   return DeviceRTL.dataAlloc(device_id, size);
 }
 
diff --git a/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
index eb6ebc1..27cb39c 100644
--- a/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
+++ b/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
@@ -250,8 +250,23 @@
   return DeviceInfo.getOffloadEntriesTable(device_id);
 }
 
-void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
-  void *ptr = malloc(size);
+// Sample implementation of explicit memory allocator. For this plugin all kinds
+// are equivalent (plain malloc); an unknown kind is reported and yields NULL.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr,
+                           int32_t kind) {
+  void *ptr = NULL;
+
+  switch (kind) {
+  case TARGET_ALLOC_DEVICE:
+  case TARGET_ALLOC_HOST:
+  case TARGET_ALLOC_SHARED:
+  case TARGET_ALLOC_DEFAULT:
+    ptr = malloc(size);
+    break;
+  default:
+    REPORT("Invalid target data allocation kind\n");
+  }
+
   return ptr;
 }
 
diff --git a/libomptarget/plugins/remote/src/rtl.cpp b/libomptarget/plugins/remote/src/rtl.cpp
index 20c415b..26f172a 100644
--- a/libomptarget/plugins/remote/src/rtl.cpp
+++ b/libomptarget/plugins/remote/src/rtl.cpp
@@ -84,7 +84,14 @@
   return Manager->isDataExchangeable(SrcDevId, DstDevId);
 }
 
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr) {
+void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr,
+                           int32_t Kind) {
+  if (Kind != TARGET_ALLOC_DEFAULT) {
+    REPORT("Invalid target data allocation kind or requested allocator not "
+           "implemented yet\n");
+    return NULL;
+  }
+
   return Manager->dataAlloc(DeviceId, Size, HstPtr);
 }
 
diff --git a/libomptarget/plugins/ve/src/rtl.cpp b/libomptarget/plugins/ve/src/rtl.cpp
index a77cd31..2b9c17e 100644
--- a/libomptarget/plugins/ve/src/rtl.cpp
+++ b/libomptarget/plugins/ve/src/rtl.cpp
@@ -330,10 +330,17 @@
 // used to generate a table of target variables to pass to
 // __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
 // case an error occurred on the target device.
-void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr) {
+void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr,
+                           int32_t kind) {
   int ret;
   uint64_t addr;
 
+  if (kind != TARGET_ALLOC_DEFAULT) {
+    REPORT("Invalid target data allocation kind or requested allocator not "
+           "implemented yet\n");
+    return NULL;
+  }
+
   if (DeviceInfo.ProcHandles[ID] == NULL) {
     struct veo_proc_handle *proc_handle;
     proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
diff --git a/libomptarget/src/api.cpp b/libomptarget/src/api.cpp
index adacc5a..3c6142a 100644
--- a/libomptarget/src/api.cpp
+++ b/libomptarget/src/api.cpp
@@ -38,31 +38,19 @@
 }
 
 EXTERN void *omp_target_alloc(size_t size, int device_num) {
-  TIMESCOPE();
-  DP("Call to omp_target_alloc for device %d requesting %zu bytes\n",
-     device_num, size);
+  return targetAllocExplicit(size, device_num, TARGET_ALLOC_DEFAULT, __func__);
+}
 
-  if (size <= 0) {
-    DP("Call to omp_target_alloc with non-positive length\n");
-    return NULL;
-  }
+EXTERN void *llvm_omp_target_alloc_device(size_t size, int device_num) {
+  return targetAllocExplicit(size, device_num, TARGET_ALLOC_DEVICE, __func__);
+}
 
-  void *rc = NULL;
+EXTERN void *llvm_omp_target_alloc_host(size_t size, int device_num) {
+  return targetAllocExplicit(size, device_num, TARGET_ALLOC_HOST, __func__);
+}
 
-  if (device_num == omp_get_initial_device()) {
-    rc = malloc(size);
-    DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc));
-    return rc;
-  }
-
-  if (!device_is_ready(device_num)) {
-    DP("omp_target_alloc returns NULL ptr\n");
-    return NULL;
-  }
-
-  rc = PM->Devices[device_num].allocData(size);
-  DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc));
-  return rc;
+EXTERN void *llvm_omp_target_alloc_shared(size_t size, int device_num) {
+  return targetAllocExplicit(size, device_num, TARGET_ALLOC_SHARED, __func__);
 }
 
 EXTERN void omp_target_free(void *device_ptr, int device_num) {
diff --git a/libomptarget/src/device.cpp b/libomptarget/src/device.cpp
index 50017ac..8f605a0 100644
--- a/libomptarget/src/device.cpp
+++ b/libomptarget/src/device.cpp
@@ -405,8 +405,8 @@
   return rc;
 }
 
-void *DeviceTy::allocData(int64_t Size, void *HstPtr) {
-  return RTL->data_alloc(RTLDeviceID, Size, HstPtr);
+void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
+  return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
 }
 
 int32_t DeviceTy::deleteData(void *TgtPtrBegin) {
diff --git a/libomptarget/src/device.h b/libomptarget/src/device.h
index 7ecdec7..32d1e96 100644
--- a/libomptarget/src/device.h
+++ b/libomptarget/src/device.h
@@ -185,13 +185,16 @@
   __tgt_target_table *load_binary(void *Img);
 
   // device memory allocation/deallocation routines
-  /// Allocates \p Size bytes on the device and returns the address/nullptr when
+  /// Allocates \p Size bytes on the device, host or shared memory space
+  /// (depending on \p Kind) and returns the address/nullptr when
   /// succeeds/fails. \p HstPtr is an address of the host data which the
   /// allocated target data will be associated with. If it is unknown, the
   /// default value of \p HstPtr is nullptr. Note: this function doesn't do
   /// pointer association. Actually, all the __tgt_rtl_data_alloc
-  /// implementations ignore \p HstPtr.
-  void *allocData(int64_t Size, void *HstPtr = nullptr);
+  /// implementations ignore \p HstPtr. \p Kind dictates what allocator should
+  /// be used (host, shared, device).
+  void *allocData(int64_t Size, void *HstPtr = nullptr,
+                  int32_t Kind = TARGET_ALLOC_DEFAULT);
   /// Deallocates memory which \p TgtPtrBegin points at and returns
   /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
   int32_t deleteData(void *TgtPtrBegin);
diff --git a/libomptarget/src/exports b/libomptarget/src/exports
index b7fc1c8..7992daa 100644
--- a/libomptarget/src/exports
+++ b/libomptarget/src/exports
@@ -36,6 +36,9 @@
     omp_target_memcpy_rect;
     omp_target_associate_ptr;
     omp_target_disassociate_ptr;
+    llvm_omp_target_alloc_host;
+    llvm_omp_target_alloc_shared;
+    llvm_omp_target_alloc_device;
   local:
     *;
 };
diff --git a/libomptarget/src/omptarget.cpp b/libomptarget/src/omptarget.cpp
index 64a5292..2c6af57 100644
--- a/libomptarget/src/omptarget.cpp
+++ b/libomptarget/src/omptarget.cpp
@@ -328,6 +328,35 @@
   return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
 }
 
+void *targetAllocExplicit(size_t size, int device_num, int kind,
+                          const char *name) {
+  TIMESCOPE();
+  DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);
+  // size is unsigned, so this check can only reject a zero-byte request.
+  if (size <= 0) {
+    DP("Call to %s with non-positive length\n", name);
+    return NULL;
+  }
+
+  void *rc = NULL;
+  // The initial (host) device is served by plain malloc; kind is not used.
+  if (device_num == omp_get_initial_device()) {
+    rc = malloc(size);
+    DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
+    return rc;
+  }
+
+  if (!device_is_ready(device_num)) {
+    DP("%s returns NULL ptr\n", name);
+    return NULL;
+  }
+  // Hand the request to the device, passing kind through to its allocator.
+  DeviceTy &Device = PM->Devices[device_num];
+  rc = Device.allocData(size, nullptr, kind);
+  DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
+  return rc;
+}
+
 /// Call the user-defined mapper function followed by the appropriate
 // targetData* function (targetData{Begin,End,Update}).
 int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
diff --git a/libomptarget/src/private.h b/libomptarget/src/private.h
index 8e4db65..a97d701 100644
--- a/libomptarget/src/private.h
+++ b/libomptarget/src/private.h
@@ -46,6 +46,8 @@
 
 extern void handleTargetOutcome(bool Success, ident_t *Loc);
 extern int checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc);
+extern void *targetAllocExplicit(size_t size, int device_num, int kind,
+                                 const char *name);
 
 // This structure stores information of a mapped memory region.
 struct MapComponentInfoTy {
diff --git a/libomptarget/src/rtl.h b/libomptarget/src/rtl.h
index a67b868..ae11eee 100644
--- a/libomptarget/src/rtl.h
+++ b/libomptarget/src/rtl.h
@@ -30,7 +30,7 @@
   typedef int32_t(number_of_devices_ty)();
   typedef int32_t(init_device_ty)(int32_t);
   typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
-  typedef void *(data_alloc_ty)(int32_t, int64_t, void *);
+  typedef void *(data_alloc_ty)(int32_t, int64_t, void *, int32_t);
   typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
   typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t,
                                         __tgt_async_info *);