[OpenMP] Unify omptarget API and usage wrt. `__tgt_async_info`

This patch unifies our libomptarget API in two ways:
  - always pass a `__tgt_async_info` object, the Queue member decides if
    it is in use or not.
  - (almost) always synchronize in the interface layer and not in the
    omptarget layer.

A side effect is that we now put all constructor and static initializer
kernels in a stream too, if the device utilizes `__tgt_async_info`.

The patch contains a TODO which can be addressed as we add support for
asynchronous malloc and free in the plugin API. This is the only
`synchronizeAsyncInfo` left in the omptarget layer.

Side note: On a V100 system the GridMini performance for small sizes
more than doubled.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D96379

GitOrigin-RevId: 758b8499310a5d44a41de67d656bd32dc3fd1023
diff --git a/libomptarget/src/api.cpp b/libomptarget/src/api.cpp
index 27fec34..adacc5a 100644
--- a/libomptarget/src/api.cpp
+++ b/libomptarget/src/api.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "device.h"
+#include "omptarget.h"
 #include "private.h"
 #include "rtl.h"
 
@@ -171,11 +172,13 @@
   } else if (src_device == omp_get_initial_device()) {
     DP("copy from host to device\n");
     DeviceTy &DstDev = PM->Devices[dst_device];
-    rc = DstDev.submitData(dstAddr, srcAddr, length, nullptr);
+    AsyncInfoTy AsyncInfo(DstDev);
+    rc = DstDev.submitData(dstAddr, srcAddr, length, AsyncInfo);
   } else if (dst_device == omp_get_initial_device()) {
     DP("copy from device to host\n");
     DeviceTy &SrcDev = PM->Devices[src_device];
-    rc = SrcDev.retrieveData(dstAddr, srcAddr, length, nullptr);
+    AsyncInfoTy AsyncInfo(SrcDev);
+    rc = SrcDev.retrieveData(dstAddr, srcAddr, length, AsyncInfo);
   } else {
     DP("copy from device to device\n");
     DeviceTy &SrcDev = PM->Devices[src_device];
@@ -183,15 +186,21 @@
     // First try to use D2D memcpy which is more efficient. If fails, fall back
     // to unefficient way.
     if (SrcDev.isDataExchangable(DstDev)) {
-      rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, nullptr);
+      AsyncInfoTy AsyncInfo(SrcDev);
+      rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, AsyncInfo);
       if (rc == OFFLOAD_SUCCESS)
         return OFFLOAD_SUCCESS;
     }
 
     void *buffer = malloc(length);
-    rc = SrcDev.retrieveData(buffer, srcAddr, length, nullptr);
-    if (rc == OFFLOAD_SUCCESS)
-      rc = DstDev.submitData(dstAddr, buffer, length, nullptr);
+    {
+      AsyncInfoTy AsyncInfo(SrcDev);
+      rc = SrcDev.retrieveData(buffer, srcAddr, length, AsyncInfo);
+    }
+    if (rc == OFFLOAD_SUCCESS) {
+      AsyncInfoTy AsyncInfo(SrcDev);
+      rc = DstDev.submitData(dstAddr, buffer, length, AsyncInfo);
+    }
     free(buffer);
   }
 
diff --git a/libomptarget/src/device.cpp b/libomptarget/src/device.cpp
index 724b887..50017ac 100644
--- a/libomptarget/src/device.cpp
+++ b/libomptarget/src/device.cpp
@@ -415,27 +415,27 @@
 
 // Submit data to device
 int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
-                             __tgt_async_info *AsyncInfoPtr) {
-  if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize)
+                             AsyncInfoTy &AsyncInfo) {
+  if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
     return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
   else
     return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
-                                  AsyncInfoPtr);
+                                  AsyncInfo);
 }
 
 // Retrieve data from device
 int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
-                               int64_t Size, __tgt_async_info *AsyncInfoPtr) {
-  if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize)
+                               int64_t Size, AsyncInfoTy &AsyncInfo) {
+  if (!RTL->data_retrieve_async || !RTL->synchronize)
     return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
   else
     return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
-                                    AsyncInfoPtr);
+                                    AsyncInfo);
 }
 
 // Copy data from current device to destination device directly
 int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
-                               int64_t Size, __tgt_async_info *AsyncInfo) {
+                               int64_t Size, AsyncInfoTy &AsyncInfo) {
   if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
     assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
     return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
@@ -448,13 +448,13 @@
 // Run region on device
 int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr,
                             ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
-                            __tgt_async_info *AsyncInfoPtr) {
-  if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize)
+                            AsyncInfoTy &AsyncInfo) {
+  if (!RTL->run_region || !RTL->synchronize)
     return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
                            TgtVarsSize);
   else
     return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
-                                 TgtOffsets, TgtVarsSize, AsyncInfoPtr);
+                                 TgtOffsets, TgtVarsSize, AsyncInfo);
 }
 
 // Run team region on device.
@@ -462,15 +462,15 @@
                                 ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
                                 int32_t NumTeams, int32_t ThreadLimit,
                                 uint64_t LoopTripCount,
-                                __tgt_async_info *AsyncInfoPtr) {
-  if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize)
+                                AsyncInfoTy &AsyncInfo) {
+  if (!RTL->run_team_region_async || !RTL->synchronize)
     return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
                                 TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,
                                 LoopTripCount);
   else
     return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
                                       TgtOffsets, TgtVarsSize, NumTeams,
-                                      ThreadLimit, LoopTripCount, AsyncInfoPtr);
+                                      ThreadLimit, LoopTripCount, AsyncInfo);
 }
 
 // Whether data can be copied to DstDevice directly
@@ -485,9 +485,9 @@
   return false;
 }
 
-int32_t DeviceTy::synchronize(__tgt_async_info *AsyncInfoPtr) {
+int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
   if (RTL->synchronize)
-    return RTL->synchronize(RTLDeviceID, AsyncInfoPtr);
+    return RTL->synchronize(RTLDeviceID, AsyncInfo);
   return OFFLOAD_SUCCESS;
 }
 
diff --git a/libomptarget/src/device.h b/libomptarget/src/device.h
index 06e24e1..84a0987 100644
--- a/libomptarget/src/device.h
+++ b/libomptarget/src/device.h
@@ -22,13 +22,13 @@
 #include <set>
 #include <vector>
 
+#include "omptarget.h"
 #include "rtl.h"
 
 // Forward declarations.
 struct RTLInfoTy;
 struct __tgt_bin_desc;
 struct __tgt_target_table;
-struct __tgt_async_info;
 
 using map_var_info_t = void *;
 
@@ -200,24 +200,24 @@
   // synchronous.
   // Copy data from host to device
   int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
-                     __tgt_async_info *AsyncInfoPtr);
+                     AsyncInfoTy &AsyncInfo);
   // Copy data from device back to host
   int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
-                       __tgt_async_info *AsyncInfoPtr);
+                       AsyncInfoTy &AsyncInfo);
   // Copy data from current device to destination device directly
   int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
-                       int64_t Size, __tgt_async_info *AsyncInfo);
+                       int64_t Size, AsyncInfoTy &AsyncInfo);
 
   int32_t runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets,
-                    int32_t TgtVarsSize, __tgt_async_info *AsyncInfoPtr);
+                    int32_t TgtVarsSize, AsyncInfoTy &AsyncInfo);
   int32_t runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
                         ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
                         int32_t NumTeams, int32_t ThreadLimit,
-                        uint64_t LoopTripCount, __tgt_async_info *AsyncInfoPtr);
+                        uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
 
   /// Synchronize device/queue/event based on \p AsyncInfoPtr and return
   /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
-  int32_t synchronize(__tgt_async_info *AsyncInfoPtr);
+  int32_t synchronize(AsyncInfoTy &AsyncInfo);
 
 private:
   // Call to RTL
diff --git a/libomptarget/src/interface.cpp b/libomptarget/src/interface.cpp
index 0140c55..095c4d3 100644
--- a/libomptarget/src/interface.cpp
+++ b/libomptarget/src/interface.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "device.h"
+#include "omptarget.h"
 #include "private.h"
 #include "rtl.h"
 
@@ -183,8 +184,11 @@
   }
 #endif
 
+  AsyncInfoTy AsyncInfo(Device);
   int rc = targetDataBegin(loc, Device, arg_num, args_base, args, arg_sizes,
-                           arg_types, arg_names, arg_mappers, nullptr);
+                           arg_types, arg_names, arg_mappers, AsyncInfo);
+  if (rc == OFFLOAD_SUCCESS)
+    rc = AsyncInfo.synchronize();
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
 }
 
@@ -270,8 +274,11 @@
   }
 #endif
 
+  AsyncInfoTy AsyncInfo(Device);
   int rc = targetDataEnd(loc, Device, arg_num, args_base, args, arg_sizes,
-                         arg_types, arg_names, arg_mappers, nullptr);
+                         arg_types, arg_names, arg_mappers, AsyncInfo);
+  if (rc == OFFLOAD_SUCCESS)
+    rc = AsyncInfo.synchronize();
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
 }
 
@@ -335,8 +342,11 @@
                          arg_names, "Updating OpenMP data");
 
   DeviceTy &Device = PM->Devices[device_id];
+  AsyncInfoTy AsyncInfo(Device);
   int rc = targetDataUpdate(loc, Device, arg_num, args_base, args, arg_sizes,
-                            arg_types, arg_names, arg_mappers, nullptr);
+                            arg_types, arg_names, arg_mappers, AsyncInfo);
+  if (rc == OFFLOAD_SUCCESS)
+    rc = AsyncInfo.synchronize();
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
 }
 
@@ -408,9 +418,12 @@
 #endif
 
   DeviceTy &Device = PM->Devices[device_id];
-  int rc =
-      target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
-             arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, nullptr);
+  AsyncInfoTy AsyncInfo(Device);
+  int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
+                  arg_types, arg_names, arg_mappers, 0, 0, false /*team*/,
+                  AsyncInfo);
+  if (rc == OFFLOAD_SUCCESS)
+    rc = AsyncInfo.synchronize();
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
   return rc;
 }
@@ -490,9 +503,12 @@
 #endif
 
   DeviceTy &Device = PM->Devices[device_id];
+  AsyncInfoTy AsyncInfo(Device);
   int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
                   arg_types, arg_names, arg_mappers, team_num, thread_limit,
-                  true /*team*/, nullptr);
+                  true /*team*/, AsyncInfo);
+  if (rc == OFFLOAD_SUCCESS)
+    rc = AsyncInfo.synchronize();
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
   return rc;
 }
diff --git a/libomptarget/src/omptarget.cpp b/libomptarget/src/omptarget.cpp
index dc49345..51dcfec 100644
--- a/libomptarget/src/omptarget.cpp
+++ b/libomptarget/src/omptarget.cpp
@@ -23,7 +23,7 @@
   int Result = OFFLOAD_SUCCESS;
   if (AsyncInfo.Queue) {
     // If we have a queue we need to synchronize it now.
-    Result = Device.synchronize(&AsyncInfo);
+    Result = Device.synchronize(*this);
     assert(AsyncInfo.Queue == nullptr &&
            "The device plugin should have nulled the queue to indicate there "
            "are no outstanding actions!");
@@ -166,6 +166,7 @@
    * Run ctors for static objects
    */
   if (!Device.PendingCtorsDtors.empty()) {
+    AsyncInfoTy AsyncInfo(Device);
     // Call all ctors for all libraries registered so far
     for (auto &lib : Device.PendingCtorsDtors) {
       if (!lib.second.PendingCtors.empty()) {
@@ -174,7 +175,7 @@
           void *ctor = entry;
           int rc =
               target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
-                     nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr);
+                     nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
           if (rc != OFFLOAD_SUCCESS) {
             REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
             Device.PendingGlobalsMtx.unlock();
@@ -186,6 +187,9 @@
         DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
       }
     }
+    // All constructors have been issued, wait for them now.
+    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
+      return OFFLOAD_FAIL;
   }
   Device.HasPendingGlobals = false;
   Device.PendingGlobalsMtx.unlock();
@@ -226,6 +230,7 @@
 int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
                      int64_t arg_size, int64_t arg_type,
                      map_var_info_t arg_names, void *arg_mapper,
+                     AsyncInfoTy &AsyncInfo,
                      TargetDataFuncPtrTy target_data_function) {
   TIMESCOPE_WITH_IDENT(loc);
   DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
@@ -256,11 +261,10 @@
     MapperArgNames[I] = C.Name;
   }
 
-  int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
-                                MapperArgsBase.data(), MapperArgs.data(),
-                                MapperArgSizes.data(), MapperArgTypes.data(),
-                                MapperArgNames.data(), /*arg_mappers*/ nullptr,
-                                /* AsyncInfoTy */ nullptr);
+  int rc = target_data_function(
+      loc, Device, MapperComponents.Components.size(), MapperArgsBase.data(),
+      MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(),
+      MapperArgNames.data(), /*arg_mappers*/ nullptr, AsyncInfo);
 
   return rc;
 }
@@ -269,7 +273,7 @@
 int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
                     void **args_base, void **args, int64_t *arg_sizes,
                     int64_t *arg_types, map_var_info_t *arg_names,
-                    void **arg_mappers, AsyncInfoTy *AsyncInfo) {
+                    void **arg_mappers, AsyncInfoTy &AsyncInfo) {
   // process each input.
   for (int32_t i = 0; i < arg_num; ++i) {
     // Ignore private variables and arrays - there is no mapping for them.
@@ -286,7 +290,7 @@
       map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
       int rc = targetDataMapper(loc, Device, args_base[i], args[i],
                                 arg_sizes[i], arg_types[i], arg_name,
-                                arg_mappers[i], targetDataBegin);
+                                arg_mappers[i], AsyncInfo, targetDataBegin);
 
       if (rc != OFFLOAD_SUCCESS) {
         REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
@@ -416,7 +420,7 @@
         DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
            data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
         int rt =
-            Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, *AsyncInfo);
+            Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, AsyncInfo);
         if (rt != OFFLOAD_SUCCESS) {
           REPORT("Copying data to device failed.\n");
           return OFFLOAD_FAIL;
@@ -430,7 +434,7 @@
       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
       void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
       int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
-                                 sizeof(void *), *AsyncInfo);
+                                 sizeof(void *), AsyncInfo);
       if (rt != OFFLOAD_SUCCESS) {
         REPORT("Copying data to device failed.\n");
         return OFFLOAD_FAIL;
@@ -470,7 +474,7 @@
 int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *ArgNames,
-                  void **ArgMappers, AsyncInfoTy *AsyncInfo) {
+                  void **ArgMappers, AsyncInfoTy &AsyncInfo) {
   int Ret;
   std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
   // process each input.
@@ -488,9 +492,9 @@
       DP("Calling targetDataMapper for the %dth argument\n", I);
 
       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
-      Ret =
-          targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
-                           ArgTypes[I], ArgName, ArgMappers[I], targetDataEnd);
+      Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
+                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
+                             targetDataEnd);
 
       if (Ret != OFFLOAD_SUCCESS) {
         REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
@@ -585,7 +589,7 @@
           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
              DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
           Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
-                                    *AsyncInfo);
+                                    AsyncInfo);
           if (Ret != OFFLOAD_SUCCESS) {
             REPORT("Copying data from device failed.\n");
             return OFFLOAD_FAIL;
@@ -637,17 +641,13 @@
     }
   }
 
+  // TODO: We should not synchronize here but pass the AsyncInfo object to the
+  //       allocate/deallocate device APIs.
+  //
   // We need to synchronize before deallocating data.
-  // If AsyncInfo is nullptr, the previous data transfer (if has) will be
-  // synchronous, so we don't need to synchronize again. If AsyncInfo->Queue is
-  // nullptr, there is no data transfer happened because once there is,
-  // AsyncInfo->Queue will not be nullptr, so again, we don't need to
-  // synchronize.
-  if (AsyncInfo) {
-    Ret = AsyncInfo->synchronize();
-    if (Ret != OFFLOAD_SUCCESS)
-      return OFFLOAD_FAIL;
-  }
+  Ret = AsyncInfo.synchronize();
+  if (Ret != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
 
   // Deallocate target pointer
   for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
@@ -664,7 +664,7 @@
 
 static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                 void *HstPtrBegin, int64_t ArgSize,
-                                int64_t ArgType) {
+                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
   TIMESCOPE_WITH_IDENT(loc);
   bool IsLast, IsHostPtr;
   void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
@@ -690,7 +690,7 @@
   if (ArgType & OMP_TGT_MAPTYPE_FROM) {
     DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
        ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr);
+    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
     if (Ret != OFFLOAD_SUCCESS) {
       REPORT("Copying data from device failed.\n");
       return OFFLOAD_FAIL;
@@ -717,7 +717,7 @@
   if (ArgType & OMP_TGT_MAPTYPE_TO) {
     DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
        ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
-    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr);
+    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
     if (Ret != OFFLOAD_SUCCESS) {
       REPORT("Copying data to device failed.\n");
       return OFFLOAD_FAIL;
@@ -737,7 +737,7 @@
          "pointer " DPxMOD "\n",
          DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
       Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
-                              sizeof(void *), nullptr);
+                              sizeof(void *), AsyncInfo);
       if (Ret != OFFLOAD_SUCCESS) {
         REPORT("Copying data to device failed.\n");
         Device.ShadowMtx.unlock();
@@ -753,8 +753,8 @@
                                    void *ArgsBase,
                                    __tgt_target_non_contig *NonContig,
                                    uint64_t Size, int64_t ArgType,
-                                   int CurrentDim, int DimSize,
-                                   uint64_t Offset) {
+                                   int CurrentDim, int DimSize, uint64_t Offset,
+                                   AsyncInfoTy &AsyncInfo) {
   TIMESCOPE_WITH_IDENT(loc);
   int Ret = OFFLOAD_SUCCESS;
   if (CurrentDim < DimSize) {
@@ -766,7 +766,7 @@
       if (CurrentDim != DimSize - 1 || I == 0) {
         Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
                                       ArgType, CurrentDim + 1, DimSize,
-                                      Offset + CurOffset);
+                                      Offset + CurOffset, AsyncInfo);
         // Stop the whole process if any contiguous piece returns anything
         // other than OFFLOAD_SUCCESS.
         if (Ret != OFFLOAD_SUCCESS)
@@ -778,7 +778,8 @@
     DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
        " len %" PRIu64 "\n",
        DPxPTR(Ptr), Offset, Size);
-    Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType);
+    Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
+                               AsyncInfo);
   }
   return Ret;
 }
@@ -794,12 +795,10 @@
 }
 
 /// Internal function to pass data to/from the target.
-// AsyncInfo is currently unused, added here so targetDataUpdate has the
-// same signature as targetDataBegin and targetDataEnd.
 int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                      void **ArgsBase, void **Args, int64_t *ArgSizes,
                      int64_t *ArgTypes, map_var_info_t *ArgNames,
-                     void **ArgMappers, AsyncInfoTy *AsyncInfo) {
+                     void **ArgMappers, AsyncInfoTy &AsyncInfo) {
   // process each input.
   for (int32_t I = 0; I < ArgNum; ++I) {
     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
@@ -814,7 +813,7 @@
 
       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
       int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
-                                 ArgTypes[I], ArgName, ArgMappers[I],
+                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                  targetDataUpdate);
 
       if (Ret != OFFLOAD_SUCCESS) {
@@ -837,10 +836,10 @@
       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
       Ret = targetDataNonContiguous(
           loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
-          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0);
+          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
     } else {
       Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
-                                 ArgTypes[I]);
+                                 ArgTypes[I], AsyncInfo);
     }
     if (Ret == OFFLOAD_FAIL)
       return OFFLOAD_FAIL;
@@ -950,7 +949,7 @@
   /// A reference to the \p DeviceTy object
   DeviceTy &Device;
   /// A pointer to a \p AsyncInfoTy object
-  AsyncInfoTy *AsyncInfo;
+  AsyncInfoTy &AsyncInfo;
 
   // TODO: What would be the best value here? Should we make it configurable?
   // If the size is larger than this threshold, we will allocate and transfer it
@@ -959,7 +958,7 @@
 
 public:
   /// Constructor
-  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy *AsyncInfo)
+  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
       : Device(Dev), AsyncInfo(AsyncInfo) {}
 
   /// Add a private argument
@@ -986,7 +985,7 @@
 #endif
       // If first-private, copy data from host
       if (IsFirstPrivate) {
-        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, *AsyncInfo);
+        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
         if (Ret != OFFLOAD_SUCCESS) {
           DP("Copying data to device failed, failed.\n");
           return OFFLOAD_FAIL;
@@ -1042,7 +1041,7 @@
          FirstPrivateArgSize, DPxPTR(TgtPtr));
       // Transfer data to target device
       int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
-                                  FirstPrivateArgSize, *AsyncInfo);
+                                  FirstPrivateArgSize, AsyncInfo);
       if (Ret != OFFLOAD_SUCCESS) {
         DP("Failed to submit data of private arguments.\n");
         return OFFLOAD_FAIL;
@@ -1090,7 +1089,7 @@
                              std::vector<void *> &TgtArgs,
                              std::vector<ptrdiff_t> &TgtOffsets,
                              PrivateArgumentManagerTy &PrivateArgumentManager,
-                             AsyncInfoTy *AsyncInfo) {
+                             AsyncInfoTy &AsyncInfo) {
   TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
   DeviceTy &Device = PM->Devices[DeviceId];
   int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
@@ -1141,7 +1140,7 @@
         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
-                                sizeof(void *), *AsyncInfo);
+                                sizeof(void *), AsyncInfo);
         if (Ret != OFFLOAD_SUCCESS) {
           REPORT("Copying data to device failed.\n");
           return OFFLOAD_FAIL;
@@ -1211,7 +1210,7 @@
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
-                            AsyncInfoTy *AsyncInfo) {
+                            AsyncInfoTy &AsyncInfo) {
   TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
   DeviceTy &Device = PM->Devices[DeviceId];
 
@@ -1243,7 +1242,7 @@
 int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
            void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
            map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
-           int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy *AsyncInfo) {
+           int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
   int32_t DeviceId = Device.DeviceID;
 
   TableMap *TM = getTableMap(HostPtr);
@@ -1264,12 +1263,6 @@
   }
   assert(TargetTable && "Global data has not been mapped\n");
 
-  // TODO: This will go away as soon as we consequently pass in async info
-  // objects (as references).
-  AsyncInfoTy InternalAsyncInfo(Device);
-  if (!AsyncInfo)
-    AsyncInfo = &InternalAsyncInfo;
-
   std::vector<void *> TgtArgs;
   std::vector<ptrdiff_t> TgtOffsets;
 
@@ -1301,10 +1294,10 @@
     if (IsTeamConstruct)
       Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                                  TgtArgs.size(), TeamNum, ThreadLimit,
-                                 LoopTripCount, *AsyncInfo);
+                                 LoopTripCount, AsyncInfo);
     else
       Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
-                             TgtArgs.size(), *AsyncInfo);
+                             TgtArgs.size(), AsyncInfo);
   }
 
   if (Ret != OFFLOAD_SUCCESS) {
@@ -1322,13 +1315,6 @@
       REPORT("Failed to process data after launching the kernel.\n");
       return OFFLOAD_FAIL;
     }
-  } else {
-    // TODO: We should not synchronize here but on the outer level once we pass
-    // in a reference AsyncInfo object.
-    // If ArgNum is zero, but AsyncInfo.Queue is valid, then the kernel doesn't
-    // hava any argument, and the device supports async operations, so we need a
-    // sync at this point.
-    return AsyncInfo->synchronize();
   }
 
   return OFFLOAD_SUCCESS;
diff --git a/libomptarget/src/private.h b/libomptarget/src/private.h
index 746eea2..fc6997a 100644
--- a/libomptarget/src/private.h
+++ b/libomptarget/src/private.h
@@ -23,23 +23,23 @@
 extern int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
                            void **args_base, void **args, int64_t *arg_sizes,
                            int64_t *arg_types, map_var_info_t *arg_names,
-                           void **arg_mappers, AsyncInfoTy *AsyncInfo);
+                           void **arg_mappers, AsyncInfoTy &AsyncInfo);
 
 extern int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                          void **ArgBases, void **Args, int64_t *ArgSizes,
                          int64_t *ArgTypes, map_var_info_t *arg_names,
-                         void **ArgMappers, AsyncInfoTy *AsyncInfo);
+                         void **ArgMappers, AsyncInfoTy &AsyncInfo);
 
 extern int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t arg_num,
                             void **args_base, void **args, int64_t *arg_sizes,
                             int64_t *arg_types, map_var_info_t *arg_names,
-                            void **arg_mappers, AsyncInfoTy *AsyncInfo);
+                            void **arg_mappers, AsyncInfoTy &AsyncInfo);
 
 extern int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *arg_names,
                   void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit,
-                  int IsTeamConstruct, AsyncInfoTy *AsyncInfo);
+                  int IsTeamConstruct, AsyncInfoTy &AsyncInfo);
 
 extern int CheckDeviceAndCtors(int64_t device_id);
 
@@ -76,7 +76,7 @@
 // targetDataEnd and targetDataUpdate).
 typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
                                    void **, int64_t *, int64_t *,
-                                   map_var_info_t *, void **, AsyncInfoTy *);
+                                   map_var_info_t *, void **, AsyncInfoTy &);
 
 // Implemented in libomp, they are called from within __tgt_* functions.
 #ifdef __cplusplus
diff --git a/libomptarget/src/rtl.cpp b/libomptarget/src/rtl.cpp
index efbac2c..60f3d49 100644
--- a/libomptarget/src/rtl.cpp
+++ b/libomptarget/src/rtl.cpp
@@ -400,16 +400,20 @@
         DeviceTy &Device = PM->Devices[FoundRTL->Idx + i];
         Device.PendingGlobalsMtx.lock();
         if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
+          AsyncInfoTy AsyncInfo(Device);
           for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
-            int rc =
-                target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr,
-                       nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr);
+            int rc = target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr,
+                            nullptr, nullptr, nullptr, 1, 1, true /*team*/,
+                            AsyncInfo);
             if (rc != OFFLOAD_SUCCESS) {
               DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
             }
           }
           // Remove this library's entry from PendingCtorsDtors
           Device.PendingCtorsDtors.erase(desc);
+          // All destructors have been issued, wait for them now.
+          if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
+            DP("Failed synchronizing destructors kernels.\n");
         }
         Device.PendingGlobalsMtx.unlock();
       }