[OpenMP] Add source location information to the libomptarget profile

Much of the libomptarget interface now receives an ident_t object. If it
is not null, we can use it to improve the profile output. For now, we
simply use the ident_t "source information string" as generated by the
frontend (FE).

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D95282

GitOrigin-RevId: 8c7fdc4c61bff94a3ac1bb4877d1c00e01ee53be
diff --git a/libomptarget/include/SourceInfo.h b/libomptarget/include/SourceInfo.h
index 32f1159..7d30a04 100644
--- a/libomptarget/include/SourceInfo.h
+++ b/libomptarget/include/SourceInfo.h
@@ -91,6 +91,7 @@
 
   const char *getName() const { return Name.c_str(); }
   const char *getFilename() const { return Filename.c_str(); }
+  const char *getProfileLocation() const { return SourceStr.data(); }
   int32_t getLine() const { return Line; }
   int32_t getColumn() const { return Column; }
   bool isAvailible() const { return (Line || Column); }
diff --git a/libomptarget/src/interface.cpp b/libomptarget/src/interface.cpp
index c773e1f..85a289c 100644
--- a/libomptarget/src/interface.cpp
+++ b/libomptarget/src/interface.cpp
@@ -132,7 +132,7 @@
                                            int64_t *arg_types,
                                            map_var_info_t *arg_names,
                                            void **arg_mappers) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (IsOffloadDisabled()) return;
 
   DP("Entering data begin region for device %" PRId64 " with %d mappings\n",
@@ -164,7 +164,7 @@
   }
 #endif
 
-  int rc = targetDataBegin(Device, arg_num, args_base, args, arg_sizes,
+  int rc = targetDataBegin(loc, Device, arg_num, args_base, args, arg_sizes,
                            arg_types, arg_names, arg_mappers, nullptr);
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
 }
@@ -174,7 +174,7 @@
     void **args, int64_t *arg_sizes, int64_t *arg_types,
     map_var_info_t *arg_names, void **arg_mappers, int32_t depNum,
     void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (depNum + noAliasDepNum > 0)
     __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc));
 
@@ -210,7 +210,7 @@
                                          int64_t *arg_types,
                                          map_var_info_t *arg_names,
                                          void **arg_mappers) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (IsOffloadDisabled()) return;
   DP("Entering data end region with %d mappings\n", arg_num);
 
@@ -247,8 +247,8 @@
   }
 #endif
 
-  int rc = targetDataEnd(Device, arg_num, args_base, args, arg_sizes, arg_types,
-                         arg_names, arg_mappers, nullptr);
+  int rc = targetDataEnd(loc, Device, arg_num, args_base, args, arg_sizes,
+                         arg_types, arg_names, arg_mappers, nullptr);
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
 }
 
@@ -257,7 +257,7 @@
     void **args, int64_t *arg_sizes, int64_t *arg_types,
     map_var_info_t *arg_names, void **arg_mappers, int32_t depNum,
     void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (depNum + noAliasDepNum > 0)
     __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc));
 
@@ -290,7 +290,7 @@
                                             int64_t *arg_types,
                                             map_var_info_t *arg_names,
                                             void **arg_mappers) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (IsOffloadDisabled()) return;
   DP("Entering data update with %d mappings\n", arg_num);
 
@@ -310,7 +310,7 @@
                          arg_names, "Updating OpenMP data");
 
   DeviceTy &Device = PM->Devices[device_id];
-  int rc = targetDataUpdate(Device, arg_num, args_base, args, arg_sizes,
+  int rc = targetDataUpdate(loc, Device, arg_num, args_base, args, arg_sizes,
                             arg_types, arg_names, arg_mappers);
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
 }
@@ -320,7 +320,7 @@
     void **args, int64_t *arg_sizes, int64_t *arg_types,
     map_var_info_t *arg_names, void **arg_mappers, int32_t depNum,
     void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (depNum + noAliasDepNum > 0)
     __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc));
 
@@ -351,7 +351,7 @@
                                int32_t arg_num, void **args_base, void **args,
                                int64_t *arg_sizes, int64_t *arg_types,
                                map_var_info_t *arg_names, void **arg_mappers) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (IsOffloadDisabled()) return OFFLOAD_FAIL;
   DP("Entering target region with entry point " DPxMOD " and device Id %"
       PRId64 "\n", DPxPTR(host_ptr), device_id);
@@ -378,7 +378,7 @@
   }
 #endif
 
-  int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+  int rc = target(loc, device_id, host_ptr, arg_num, args_base, args, arg_sizes,
                   arg_types, arg_names, arg_mappers, 0, 0, false /*team*/);
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
   return rc;
@@ -389,7 +389,7 @@
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
     map_var_info_t *arg_names, void **arg_mappers, int32_t depNum,
     void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (depNum + noAliasDepNum > 0)
     __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc));
 
@@ -426,7 +426,7 @@
                                      map_var_info_t *arg_names,
                                      void **arg_mappers, int32_t team_num,
                                      int32_t thread_limit) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (IsOffloadDisabled()) return OFFLOAD_FAIL;
   DP("Entering target region with entry point " DPxMOD " and device Id %"
       PRId64 "\n", DPxPTR(host_ptr), device_id);
@@ -453,7 +453,7 @@
   }
 #endif
 
-  int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+  int rc = target(loc, device_id, host_ptr, arg_num, args_base, args, arg_sizes,
                   arg_types, arg_names, arg_mappers, team_num, thread_limit,
                   true /*team*/);
   HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
@@ -466,7 +466,7 @@
     map_var_info_t *arg_names, void **arg_mappers, int32_t team_num,
     int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum,
     void *noAliasDepList) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (depNum + noAliasDepNum > 0)
     __kmpc_omp_taskwait(loc, __kmpc_global_thread_num(loc));
 
@@ -502,7 +502,7 @@
 
 EXTERN void __kmpc_push_target_tripcount(ident_t *loc, int64_t device_id,
                                          uint64_t loop_tripcount) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_IDENT(loc);
   if (IsOffloadDisabled())
     return;
 
diff --git a/libomptarget/src/omptarget.cpp b/libomptarget/src/omptarget.cpp
index 8cb16a4..85e3cf6 100644
--- a/libomptarget/src/omptarget.cpp
+++ b/libomptarget/src/omptarget.cpp
@@ -51,7 +51,7 @@
 static const int64_t Alignment = 8;
 
 /// Map global data and execute pending ctors
-static int InitLibrary(DeviceTy& Device) {
+static int InitLibrary(DeviceTy &Device) {
   /*
    * Map global data
    */
@@ -84,8 +84,8 @@
       break;
     }
     // 2) load image into the target table.
-    __tgt_target_table *TargetTable =
-        TransTable->TargetsTable[device_id] = Device.load_binary(img);
+    __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
+        Device.load_binary(img);
     // Unable to get table for this image: invalidate image and fail.
     if (!TargetTable) {
       REPORT("Unable to generate entries table for device id %d.\n", device_id);
@@ -129,8 +129,9 @@
         if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
           continue;
         DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
-            "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
-            CurrDeviceEntry->size);
+           "\n",
+           DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
+           CurrDeviceEntry->size);
         Device.HostDataToTargetMap.emplace(
             (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
             (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
@@ -158,8 +159,9 @@
         DP("Has pending ctors... call now\n");
         for (auto &entry : lib.second.PendingCtors) {
           void *ctor = entry;
-          int rc = target(device_id, ctor, 0, nullptr, nullptr, nullptr,
-                          nullptr, nullptr, nullptr, 1, 1, true /*team*/);
+          int rc =
+              target(nullptr, device_id, ctor, 0, nullptr, nullptr, nullptr,
+                     nullptr, nullptr, nullptr, 1, 1, true /*team*/);
           if (rc != OFFLOAD_SUCCESS) {
             REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
             Device.PendingGlobalsMtx.unlock();
@@ -208,10 +210,11 @@
 
 /// Call the user-defined mapper function followed by the appropriate
 // target_data_* function (target_data_{begin,end,update}).
-int targetDataMapper(DeviceTy &Device, void *arg_base, void *arg,
+int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
                      int64_t arg_size, int64_t arg_type,
                      map_var_info_t arg_names, void *arg_mapper,
                      TargetDataFuncPtrTy target_data_function) {
+  TIMESCOPE_WITH_IDENT(loc);
   DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
 
   // The mapper function fills up Components.
@@ -240,7 +243,7 @@
     MapperArgNames[I] = C.Name;
   }
 
-  int rc = target_data_function(Device, MapperComponents.Components.size(),
+  int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
                                 MapperArgsBase.data(), MapperArgs.data(),
                                 MapperArgSizes.data(), MapperArgTypes.data(),
                                 MapperArgNames.data(), /*arg_mappers*/ nullptr,
@@ -250,10 +253,10 @@
 }
 
 /// Internal function to do the mapping and transfer the data to the device
-int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base,
-                    void **args, int64_t *arg_sizes, int64_t *arg_types,
-                    map_var_info_t *arg_names, void **arg_mappers,
-                    __tgt_async_info *async_info_ptr) {
+int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
+                    void **args_base, void **args, int64_t *arg_sizes,
+                    int64_t *arg_types, map_var_info_t *arg_names,
+                    void **arg_mappers, __tgt_async_info *async_info_ptr) {
   // process each input.
   for (int32_t i = 0; i < arg_num; ++i) {
     // Ignore private variables and arrays - there is no mapping for them.
@@ -268,7 +271,7 @@
       DP("Calling targetDataMapper for the %dth argument\n", i);
 
       map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
-      int rc = targetDataMapper(Device, args_base[i], args[i], arg_sizes[i],
+      int rc = targetDataMapper(loc, Device, args_base[i], args[i], arg_sizes[i],
                                 arg_types[i], arg_name, arg_mappers[i],
                                 targetDataBegin);
 
@@ -291,14 +294,15 @@
     // Look at the next argument - if that is MEMBER_OF this one, then this one
     // is a combined entry.
     int64_t padding = 0;
-    const int next_i = i+1;
+    const int next_i = i + 1;
     if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
         getParentIndex(arg_types[next_i]) == i) {
       padding = (int64_t)HstPtrBegin % Alignment;
       if (padding) {
         DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
-            "\n", padding, DPxPTR(HstPtrBegin));
-        HstPtrBegin = (char *) HstPtrBegin - padding;
+           "\n",
+           padding, DPxPTR(HstPtrBegin));
+        HstPtrBegin = (char *)HstPtrBegin - padding;
         data_size += padding;
       }
     }
@@ -344,8 +348,9 @@
         return OFFLOAD_FAIL;
       }
       DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
-          "\n", sizeof(void *), DPxPTR(PointerTgtPtrBegin),
-          (Pointer_IsNew ? "" : " not"));
+         "\n",
+         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
+         (Pointer_IsNew ? "" : " not"));
       Pointer_HstPtrBegin = HstPtrBase;
       // modify current entry.
       HstPtrBase = *(void **)HstPtrBase;
@@ -364,8 +369,8 @@
       return OFFLOAD_FAIL;
     }
     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
-        " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
-        (IsNew ? "" : " not"));
+       " - is%s new\n",
+       data_size, DPxPTR(TgtPtrBegin), (IsNew ? "" : " not"));
 
     if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
       uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
@@ -449,10 +454,10 @@
 } // namespace
 
 /// Internal function to undo the mapping and retrieve the data from the device.
-int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases,
-                  void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
-                  map_var_info_t *ArgNames, void **ArgMappers,
-                  __tgt_async_info *AsyncInfo) {
+int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
+                  void **ArgBases, void **Args, int64_t *ArgSizes,
+                  int64_t *ArgTypes, map_var_info_t *ArgNames,
+                  void **ArgMappers, __tgt_async_info *AsyncInfo) {
   int Ret;
   std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
   // process each input.
@@ -471,7 +476,7 @@
 
       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
       Ret =
-          targetDataMapper(Device, ArgBases[I], Args[I], ArgSizes[I],
+          targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                            ArgTypes[I], ArgName, ArgMappers[I], targetDataEnd);
 
       if (Ret != OFFLOAD_SUCCESS) {
@@ -646,9 +651,10 @@
   return OFFLOAD_SUCCESS;
 }
 
-static int targetDataContiguous(DeviceTy &Device, void *ArgsBase,
+static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                 void *HstPtrBegin, int64_t ArgSize,
                                 int64_t ArgType) {
+  TIMESCOPE_WITH_IDENT(loc);
   bool IsLast, IsHostPtr;
   void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
                                             IsHostPtr, /*MustContain=*/true);
@@ -732,11 +738,13 @@
   return OFFLOAD_SUCCESS;
 }
 
-static int targetDataNonContiguous(DeviceTy &Device, void *ArgsBase,
+static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
+                                   void *ArgsBase,
                                    __tgt_target_non_contig *NonContig,
                                    uint64_t Size, int64_t ArgType,
                                    int CurrentDim, int DimSize,
                                    uint64_t Offset) {
+  TIMESCOPE_WITH_IDENT(loc);
   int Ret = OFFLOAD_SUCCESS;
   if (CurrentDim < DimSize) {
     for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
@@ -745,7 +753,7 @@
       // we only need to transfer the first element for the last dimension
       // since we've already got a contiguous piece.
       if (CurrentDim != DimSize - 1 || I == 0) {
-        Ret = targetDataNonContiguous(Device, ArgsBase, NonContig, Size,
+        Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
                                       ArgType, CurrentDim + 1, DimSize,
                                       Offset + CurOffset);
         // Stop the whole process if any contiguous piece returns anything
@@ -758,7 +766,7 @@
     char *Ptr = (char *)ArgsBase + Offset;
     DP("Transfer of non-contiguous : host ptr %lx offset %ld len %ld\n",
        (uint64_t)Ptr, Offset, Size);
-    Ret = targetDataContiguous(Device, ArgsBase, Ptr, Size, ArgType);
+    Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType);
   }
   return Ret;
 }
@@ -776,10 +784,10 @@
 /// Internal function to pass data to/from the target.
 // async_info_ptr is currently unused, added here so targetDataUpdate has the
 // same signature as targetDataBegin and targetDataEnd.
-int targetDataUpdate(DeviceTy &Device, int32_t ArgNum, void **ArgsBase,
-                     void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
-                     map_var_info_t *ArgNames, void **ArgMappers,
-                     __tgt_async_info *AsyncInfoPtr) {
+int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
+                     void **ArgsBase, void **Args, int64_t *ArgSizes,
+                     int64_t *ArgTypes, map_var_info_t *ArgNames,
+                     void **ArgMappers, __tgt_async_info *AsyncInfoPtr) {
   // process each input.
   for (int32_t I = 0; I < ArgNum; ++I) {
     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
@@ -793,7 +801,7 @@
       DP("Calling targetDataMapper for the %dth argument\n", I);
 
       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
-      int Ret = targetDataMapper(Device, ArgsBase[I], Args[I], ArgSizes[I],
+      int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                  ArgTypes[I], ArgName, ArgMappers[I],
                                  targetDataUpdate);
 
@@ -816,10 +824,10 @@
           NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
       Ret = targetDataNonContiguous(
-          Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
+          loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0);
     } else {
-      Ret = targetDataContiguous(Device, ArgsBase[I], Args[I], ArgSizes[I],
+      Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                  ArgTypes[I]);
     }
     if (Ret == OFFLOAD_FAIL)
@@ -1063,16 +1071,18 @@
 /// Process data before launching the kernel, including calling targetDataBegin
 /// to map and transfer data to target device, transferring (first-)private
 /// variables.
-int processDataBefore(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
-                      void **ArgBases, void **Args, int64_t *ArgSizes,
-                      int64_t *ArgTypes, map_var_info_t *ArgNames,
-                      void **ArgMappers, std::vector<void *> &TgtArgs,
-                      std::vector<ptrdiff_t> &TgtOffsets,
-                      PrivateArgumentManagerTy &PrivateArgumentManager,
-                      __tgt_async_info *AsyncInfo) {
+static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
+                             int32_t ArgNum, void **ArgBases, void **Args,
+                             int64_t *ArgSizes, int64_t *ArgTypes,
+                             map_var_info_t *ArgNames, void **ArgMappers,
+                             std::vector<void *> &TgtArgs,
+                             std::vector<ptrdiff_t> &TgtOffsets,
+                             PrivateArgumentManagerTy &PrivateArgumentManager,
+                             __tgt_async_info *AsyncInfo) {
+  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
   DeviceTy &Device = PM->Devices[DeviceId];
-  int Ret = targetDataBegin(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
-                            ArgNames, ArgMappers, AsyncInfo);
+  int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
+                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
   if (Ret != OFFLOAD_SUCCESS) {
     REPORT("Call to targetDataBegin failed, abort target.\n");
     return OFFLOAD_FAIL;
@@ -1184,17 +1194,18 @@
 
 /// Process data after launching the kernel, including transferring data back to
 /// host if needed and deallocating target memory of (first-)private variables.
-int processDataAfter(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
-                     void **ArgBases, void **Args, int64_t *ArgSizes,
-                     int64_t *ArgTypes, map_var_info_t *ArgNames,
-                     void **ArgMappers,
-                     PrivateArgumentManagerTy &PrivateArgumentManager,
-                     __tgt_async_info *AsyncInfo) {
+static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
+                            int32_t ArgNum, void **ArgBases, void **Args,
+                            int64_t *ArgSizes, int64_t *ArgTypes,
+                            map_var_info_t *ArgNames, void **ArgMappers,
+                            PrivateArgumentManagerTy &PrivateArgumentManager,
+                            __tgt_async_info *AsyncInfo) {
+  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
   DeviceTy &Device = PM->Devices[DeviceId];
 
   // Move data from device.
-  int Ret = targetDataEnd(Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes,
-                          ArgNames, ArgMappers, AsyncInfo);
+  int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
+                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
   if (Ret != OFFLOAD_SUCCESS) {
     REPORT("Call to targetDataEnd failed, abort target.\n");
     return OFFLOAD_FAIL;
@@ -1217,8 +1228,8 @@
 /// performs the same action as data_update and data_end above. This function
 /// returns 0 if it was able to transfer the execution to a target and an
 /// integer different from zero otherwise.
-int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum, void **ArgBases,
-           void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
+int target(ident_t *loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum,
+           void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
            map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
            int32_t ThreadLimit, int IsTeamConstruct) {
   DeviceTy &Device = PM->Devices[DeviceId];
@@ -1248,13 +1259,16 @@
 
   PrivateArgumentManagerTy PrivateArgumentManager(Device, &AsyncInfo);
 
-  // Process data, such as data mapping, before launching the kernel
-  int Ret = processDataBefore(DeviceId, HostPtr, ArgNum, ArgBases, Args,
-                              ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
-                              TgtOffsets, PrivateArgumentManager, &AsyncInfo);
-  if (Ret != OFFLOAD_SUCCESS) {
-    REPORT("Failed to process data before launching the kernel.\n");
-    return OFFLOAD_FAIL;
+  int Ret;
+  if (ArgNum) {
+    // Process data, such as data mapping, before launching the kernel
+    Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
+                            ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
+                            TgtOffsets, PrivateArgumentManager, &AsyncInfo);
+    if (Ret != OFFLOAD_SUCCESS) {
+      REPORT("Failed to process data before launching the kernel.\n");
+      return OFFLOAD_FAIL;
+    }
   }
 
   // Get loop trip count
@@ -1265,27 +1279,33 @@
   DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
      TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
 
-  if (IsTeamConstruct)
-    Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
-                               TgtArgs.size(), TeamNum, ThreadLimit,
-                               LoopTripCount, &AsyncInfo);
-  else
-    Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
-                           TgtArgs.size(), &AsyncInfo);
+  {
+    TIMESCOPE_WITH_NAME_AND_IDENT(
+        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
+    if (IsTeamConstruct)
+      Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
+                                 TgtArgs.size(), TeamNum, ThreadLimit,
+                                 LoopTripCount, &AsyncInfo);
+    else
+      Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
+                             TgtArgs.size(), &AsyncInfo);
+  }
 
   if (Ret != OFFLOAD_SUCCESS) {
     REPORT("Executing target region abort target.\n");
     return OFFLOAD_FAIL;
   }
 
-  // Transfer data back and deallocate target memory for (first-)private
-  // variables
-  Ret = processDataAfter(DeviceId, HostPtr, ArgNum, ArgBases, Args, ArgSizes,
-                         ArgTypes, ArgNames, ArgMappers, PrivateArgumentManager,
-                         &AsyncInfo);
-  if (Ret != OFFLOAD_SUCCESS) {
-    REPORT("Failed to process data after launching the kernel.\n");
-    return OFFLOAD_FAIL;
+  if (ArgNum) {
+    // Transfer data back and deallocate target memory for (first-)private
+    // variables
+    Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
+                           ArgSizes, ArgTypes, ArgNames, ArgMappers,
+                           PrivateArgumentManager, &AsyncInfo);
+    if (Ret != OFFLOAD_SUCCESS) {
+      REPORT("Failed to process data after launching the kernel.\n");
+      return OFFLOAD_FAIL;
+    }
   }
 
   return OFFLOAD_SUCCESS;
diff --git a/libomptarget/src/private.h b/libomptarget/src/private.h
index 6a44dd9..340ed23 100644
--- a/libomptarget/src/private.h
+++ b/libomptarget/src/private.h
@@ -19,22 +19,24 @@
 
 #include <cstdint>
 
-extern int targetDataBegin(DeviceTy &Device, int32_t arg_num, void **args_base,
-                           void **args, int64_t *arg_sizes, int64_t *arg_types,
-                           map_var_info_t *arg_names, void **arg_mappers,
+extern int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
+                           void **args_base, void **args, int64_t *arg_sizes,
+                           int64_t *arg_types, map_var_info_t *arg_names,
+                           void **arg_mappers,
                            __tgt_async_info *async_info_ptr);
 
-extern int targetDataEnd(DeviceTy &Device, int32_t ArgNum, void **ArgBases,
-                         void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
-                         map_var_info_t *arg_names, void **ArgMappers,
-                         __tgt_async_info *AsyncInfo);
+extern int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
+                         void **ArgBases, void **Args, int64_t *ArgSizes,
+                         int64_t *ArgTypes, map_var_info_t *arg_names,
+                         void **ArgMappers, __tgt_async_info *AsyncInfo);
 
-extern int targetDataUpdate(DeviceTy &Device, int32_t arg_num, void **args_base,
-                            void **args, int64_t *arg_sizes, int64_t *arg_types,
-                            map_var_info_t *arg_names, void **arg_mappers,
+extern int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t arg_num,
+                            void **args_base, void **args, int64_t *arg_sizes,
+                            int64_t *arg_types, map_var_info_t *arg_names,
+                            void **arg_mappers,
                             __tgt_async_info *async_info_ptr = nullptr);
 
-extern int target(int64_t DeviceId, void *HostPtr, int32_t ArgNum,
+extern int target(ident_t *loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *arg_names,
                   void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit,
@@ -73,9 +75,10 @@
 
 // Function pointer type for target_data_* functions (targetDataBegin,
 // targetDataEnd and targetDataUpdate).
-typedef int (*TargetDataFuncPtrTy)(DeviceTy &, int32_t, void **, void **,
-                                   int64_t *, int64_t *, map_var_info_t *,
-                                   void **, __tgt_async_info *);
+typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
+                                   void **, int64_t *, int64_t *,
+                                   map_var_info_t *, void **,
+                                   __tgt_async_info *);
 
 // Implemented in libomp, they are called from within __tgt_* functions.
 #ifdef __cplusplus
@@ -157,8 +160,22 @@
 #ifdef OMPTARGET_PROFILE_ENABLED
 #include "llvm/Support/TimeProfiler.h"
 #define TIMESCOPE() llvm::TimeTraceScope TimeScope(__FUNCTION__)
+// Open a time-trace scope named after the enclosing function and pass the
+// profile location string of a SourceInfo built from IDENT as second argument.
+// Expands to two declarations (SI and TimeScope): use at most once per scope.
+#define TIMESCOPE_WITH_IDENT(IDENT)                                            \
+  SourceInfo SI(IDENT);                                                        \
+  llvm::TimeTraceScope TimeScope(__FUNCTION__, SI.getProfileLocation())
+// Same as TIMESCOPE_WITH_IDENT, but the scope is named NAME.
+#define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT)                             \
+  SourceInfo SI(IDENT);                                                        \
+  llvm::TimeTraceScope TimeScope(NAME, SI.getProfileLocation())
 #else
+// Profiling disabled: every TIMESCOPE* macro expands to nothing. Parameter
+// lists must mirror the enabled variants so call sites compile either way.
 #define TIMESCOPE()
+#define TIMESCOPE_WITH_IDENT(IDENT)
+#define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT)
 #endif
 
 #endif
diff --git a/libomptarget/src/rtl.cpp b/libomptarget/src/rtl.cpp
index 443359a..4a2a6d9 100644
--- a/libomptarget/src/rtl.cpp
+++ b/libomptarget/src/rtl.cpp
@@ -396,8 +396,9 @@
         Device.PendingGlobalsMtx.lock();
         if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
           for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
-            int rc = target(Device.DeviceID, dtor, 0, nullptr, nullptr, nullptr,
-                            nullptr, nullptr, nullptr, 1, 1, true /*team*/);
+            int rc =
+                target(nullptr, Device.DeviceID, dtor, 0, nullptr, nullptr,
+                       nullptr, nullptr, nullptr, nullptr, 1, 1, true /*team*/);
             if (rc != OFFLOAD_SUCCESS) {
               DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
             }