[OpenMP] Use function tracing RAII for runtime functions.

This patch adds support for using function tracing features to track the
execution of runtime functions in the device runtime library. This is
enabled by first compiling the new runtime with
`-fopenmp-target-debug=3` and running with
`LIBOMPTARGET_DEVICE_RTL_DEBUG=3`. The output only tracks team 0 and
thread 0 so there isn't much output when using a generic region.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D112002

GitOrigin-RevId: 74f91741b66b9327fdbae6411286672ec088c3a3
diff --git a/libomptarget/DeviceRTL/include/Debug.h b/libomptarget/DeviceRTL/include/Debug.h
index e8e9078..6aa801d 100644
--- a/libomptarget/DeviceRTL/include/Debug.h
+++ b/libomptarget/DeviceRTL/include/Debug.h
@@ -49,13 +49,13 @@
 /// Enter a debugging scope for performing function traces. Enabled with
 /// FunctionTracting set in the debug kind.
 #define FunctionTracingRAII()                                                  \
-  DebugEntryRAII Entry(__LINE__, __PRETTY_FUNCTION__);
+  DebugEntryRAII Entry(__FILE__, __LINE__, __PRETTY_FUNCTION__);
 
 /// An RAII class for handling entries to debug locations. The current location
 /// and function will be printed on entry. Nested levels increase the
 /// indentation shown in the debugging output.
 struct DebugEntryRAII {
-  DebugEntryRAII(const unsigned Line, const char *Function);
+  DebugEntryRAII(const char *File, const unsigned Line, const char *Function);
   ~DebugEntryRAII();
 };
 
diff --git a/libomptarget/DeviceRTL/src/Debug.cpp b/libomptarget/DeviceRTL/src/Debug.cpp
index 6fa6843..2f0e608 100644
--- a/libomptarget/DeviceRTL/src/Debug.cpp
+++ b/libomptarget/DeviceRTL/src/Debug.cpp
@@ -12,6 +12,7 @@
 
 #include "Debug.h"
 #include "Configuration.h"
+#include "Interface.h"
 #include "Mapping.h"
 #include "Types.h"
 
@@ -41,14 +42,15 @@
 static uint32_t Level = 0;
 #pragma omp allocate(Level) allocator(omp_pteam_mem_alloc)
 
-DebugEntryRAII::DebugEntryRAII(const unsigned Line, const char *Function) {
+DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line,
+                               const char *Function) {
   if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
-      mapping::getThreadIdInBlock() == 0) {
+      mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) {
 
     for (int I = 0; I < Level; ++I)
       PRINTF("%s", "  ");
 
-    PRINTF("Line %u: Thread %u Entering %s:%u\n", Line,
+    PRINTF("%s:%u: Thread %u Entering %s\n", File, Line,
            mapping::getThreadIdInBlock(), Function);
     Level++;
   }
@@ -56,7 +58,7 @@
 
 DebugEntryRAII::~DebugEntryRAII() {
   if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
-      mapping::getThreadIdInBlock() == 0)
+      mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0)
     Level--;
 }
 
diff --git a/libomptarget/DeviceRTL/src/Kernel.cpp b/libomptarget/DeviceRTL/src/Kernel.cpp
index d47fa03..94bf432 100644
--- a/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -30,6 +30,7 @@
 
 /// Simple generic state machine for worker threads.
 static void genericStateMachine(IdentTy *Ident) {
+  FunctionTracingRAII();
 
   uint32_t TId = mapping::getThreadIdInBlock();
 
@@ -66,6 +67,7 @@
 ///
 int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
                            bool UseGenericStateMachine, bool) {
+  FunctionTracingRAII();
   const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
   if (IsSPMD) {
     inititializeRuntime(/* IsSPMD */ true);
@@ -98,6 +100,7 @@
 /// \param Ident Source location identification, can be NULL.
 ///
 void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode, bool) {
+  FunctionTracingRAII();
   const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
   state::assumeInitialState(IsSPMD);
   if (IsSPMD)
@@ -107,7 +110,10 @@
   state::ParallelRegionFn = nullptr;
 }
 
-int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
+int8_t __kmpc_is_spmd_exec_mode() {
+  FunctionTracingRAII();
+  return mapping::isSPMDMode();
+}
 }
 
 #pragma omp end declare target
diff --git a/libomptarget/DeviceRTL/src/Mapping.cpp b/libomptarget/DeviceRTL/src/Mapping.cpp
index 740cc7b..9bd26c8 100644
--- a/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -231,10 +231,12 @@
 
 extern "C" {
 __attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() {
+  FunctionTracingRAII();
   return mapping::getThreadIdInBlock();
 }
 
 __attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() {
+  FunctionTracingRAII();
   return mapping::getNumberOfProcessorElements();
 }
 }
diff --git a/libomptarget/DeviceRTL/src/Misc.cpp b/libomptarget/DeviceRTL/src/Misc.cpp
index 44fb85b..7284be8 100644
--- a/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/libomptarget/DeviceRTL/src/Misc.cpp
@@ -11,6 +11,8 @@
 
 #include "Types.h"
 
+#include "Debug.h"
+
 #pragma omp declare target
 
 namespace _OMP {
@@ -60,9 +62,15 @@
 ///{
 
 extern "C" {
-int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
+int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) {
+  FunctionTracingRAII();
+  return 0;
+}
 
-int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
+int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) {
+  FunctionTracingRAII();
+  return 0;
+}
 
 double omp_get_wtick(void) { return _OMP::impl::getWTick(); }
 
diff --git a/libomptarget/DeviceRTL/src/Parallelism.cpp b/libomptarget/DeviceRTL/src/Parallelism.cpp
index e80f046..8dcda21 100644
--- a/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -66,6 +66,7 @@
 // Invoke an outlined parallel function unwrapping arguments (up to 32).
 void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                      void **args, int64_t nargs) {
+  DebugEntryRAII Entry(__FILE__, __LINE__, "<OpenMP Outlined Function>");
   switch (nargs) {
 #include "generated_microtask_cases.gen"
   default:
@@ -81,6 +82,7 @@
 void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                         int32_t num_threads, int proc_bind, void *fn,
                         void *wrapper_fn, void **args, int64_t nargs) {
+  FunctionTracingRAII();
 
   uint32_t TId = mapping::getThreadIdInBlock();
   // Handle the serialized case first, same for SPMD/non-SPMD.
@@ -171,6 +173,7 @@
 
 __attribute__((noinline)) bool
 __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
+  FunctionTracingRAII();
   // Work function and arguments for L1 parallel region.
   *WorkFn = state::ParallelRegionFn;
 
@@ -185,6 +188,7 @@
 }
 
 __attribute__((noinline)) void __kmpc_kernel_end_parallel() {
+  FunctionTracingRAII();
   // In case we have modified an ICV for this thread before a ThreadState was
   // created. We drop it now to not contaminate the next parallel region.
   ASSERT(!mapping::isSPMDMode());
@@ -193,18 +197,29 @@
   ASSERT(!mapping::isSPMDMode());
 }
 
-uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
+uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
+  FunctionTracingRAII();
+  return omp_get_level();
+}
 
-int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
+int32_t __kmpc_global_thread_num(IdentTy *) {
+  FunctionTracingRAII();
+  return omp_get_thread_num();
+}
 
 void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) {
+  FunctionTracingRAII();
   icv::NThreads = NumThreads;
 }
 
 void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
-                           int32_t thread_limit) {}
+                           int32_t thread_limit) {
+  FunctionTracingRAII();
+}
 
-void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
+void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
+  FunctionTracingRAII();
+}
 }
 
 #pragma omp end declare target
diff --git a/libomptarget/DeviceRTL/src/Reduction.cpp b/libomptarget/DeviceRTL/src/Reduction.cpp
index 05efe95..dd1d30d 100644
--- a/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -176,6 +176,7 @@
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
+  FunctionTracingRAII();
   return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
                                       shflFct, cpyFct, mapping::isSPMDMode(),
                                       false);
@@ -186,6 +187,7 @@
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
     ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
     ListGlobalFnTy glredFct) {
+  FunctionTracingRAII();
 
   // Terminate all threads in non-SPMD mode except for the master thread.
   uint32_t ThreadId = mapping::getThreadIdInBlock();
@@ -310,9 +312,9 @@
   return 0;
 }
 
-void __kmpc_nvptx_end_reduce(int32_t TId) {}
+void __kmpc_nvptx_end_reduce(int32_t TId) { FunctionTracingRAII(); }
 
-void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {}
+void __kmpc_nvptx_end_reduce_nowait(int32_t TId) { FunctionTracingRAII(); }
 }
 
 #pragma omp end declare target
diff --git a/libomptarget/DeviceRTL/src/State.cpp b/libomptarget/DeviceRTL/src/State.cpp
index f39b61c..54a191c 100644
--- a/libomptarget/DeviceRTL/src/State.cpp
+++ b/libomptarget/DeviceRTL/src/State.cpp
@@ -498,10 +498,12 @@
 
 extern "C" {
 __attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
+  FunctionTracingRAII();
   return memory::allocShared(Bytes, "Frontend alloc shared");
 }
 
 __attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
+  FunctionTracingRAII();
   memory::freeShared(Ptr, Bytes, "Frontend free shared");
 }
 
@@ -523,6 +525,7 @@
     allocator(omp_pteam_mem_alloc)
 
 void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
+  FunctionTracingRAII();
   if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
     SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
   } else {
@@ -533,11 +536,13 @@
 }
 
 void __kmpc_end_sharing_variables() {
+  FunctionTracingRAII();
   if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
     memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
 }
 
 void __kmpc_get_shared_variables(void ***GlobalArgs) {
+  FunctionTracingRAII();
   *GlobalArgs = SharedMemVariableSharingSpacePtr;
 }
 }
diff --git a/libomptarget/DeviceRTL/src/Synchronization.cpp b/libomptarget/DeviceRTL/src/Synchronization.cpp
index 931dffc..e219c75 100644
--- a/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -321,16 +321,18 @@
 }
 
 extern "C" {
-void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
+void __kmpc_ordered(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); }
 
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}
+void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); }
 
 int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
+  FunctionTracingRAII();
   __kmpc_barrier(Loc, TId);
   return 0;
 }
 
 void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
+  FunctionTracingRAII();
   if (mapping::isMainThreadInGenericMode())
     return __kmpc_flush(Loc);
 
@@ -342,34 +344,49 @@
 
 __attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
                                                           int32_t TId) {
+  FunctionTracingRAII();
   synchronize::threadsAligned();
 }
 
 int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
+  FunctionTracingRAII();
   return omp_get_team_num() == 0;
 }
 
-void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}
+void __kmpc_end_master(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); }
 
 int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
+  FunctionTracingRAII();
   return __kmpc_master(Loc, TId);
 }
 
 void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
+  FunctionTracingRAII();
   // The barrier is explicitly called.
 }
 
-void __kmpc_flush(IdentTy *Loc) { fence::kernel(__ATOMIC_SEQ_CST); }
+void __kmpc_flush(IdentTy *Loc) {
+  FunctionTracingRAII();
+  fence::kernel(__ATOMIC_SEQ_CST);
+}
 
-uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }
+uint64_t __kmpc_warp_active_thread_mask(void) {
+  FunctionTracingRAII();
+  return mapping::activemask();
+}
 
-void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }
+void __kmpc_syncwarp(uint64_t Mask) {
+  FunctionTracingRAII();
+  synchronize::warp(Mask);
+}
 
 void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+  FunctionTracingRAII();
   omp_set_lock(reinterpret_cast<omp_lock_t *>(Name));
 }
 
 void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+  FunctionTracingRAII();
   omp_unset_lock(reinterpret_cast<omp_lock_t *>(Name));
 }
 
diff --git a/libomptarget/DeviceRTL/src/Tasking.cpp b/libomptarget/DeviceRTL/src/Tasking.cpp
index 6b6991e..0416395 100644
--- a/libomptarget/DeviceRTL/src/Tasking.cpp
+++ b/libomptarget/DeviceRTL/src/Tasking.cpp
@@ -26,6 +26,7 @@
                                         uint64_t TaskSizeInclPrivateValues,
                                         uint64_t SharedValuesSize,
                                         TaskFnTy TaskFn) {
+  FunctionTracingRAII();
   auto TaskSizeInclPrivateValuesPadded =
       utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *)));
   auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
@@ -40,12 +41,14 @@
 
 int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
                         TaskDescriptorTy *TaskDescriptor) {
+  FunctionTracingRAII();
   return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
 }
 
 int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
                                   TaskDescriptorTy *TaskDescriptor, int32_t,
                                   void *, int32_t, void *) {
+  FunctionTracingRAII();
   state::DateEnvironmentRAII DERAII;
 
   TaskDescriptor->TaskFn(0, TaskDescriptor);
@@ -56,31 +59,42 @@
 
 void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
                                TaskDescriptorTy *TaskDescriptor) {
+  FunctionTracingRAII();
   state::enterDataEnvironment();
 }
 
 void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
                                   TaskDescriptorTy *TaskDescriptor) {
+  FunctionTracingRAII();
   state::exitDataEnvironment();
 
   memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
 }
 
 void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
-                          void *) {}
+                          void *) {
+  FunctionTracingRAII();
+}
 
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {}
+void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) { FunctionTracingRAII(); }
 
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {}
+void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) { FunctionTracingRAII(); }
 
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; }
+int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) {
+  FunctionTracingRAII();
+  return 0;
+}
 
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; }
+int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) {
+  FunctionTracingRAII();
+  return 0;
+}
 
 void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
                      TaskDescriptorTy *TaskDescriptor, int,
                      uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
                      int32_t, uint64_t, void *) {
+  FunctionTracingRAII();
   // Skip task entirely if empty iteration space.
   if (*LowerBound > *UpperBound)
     return;
diff --git a/libomptarget/DeviceRTL/src/Utils.cpp b/libomptarget/DeviceRTL/src/Utils.cpp
index 3f65f21..8fcb96b 100644
--- a/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/libomptarget/DeviceRTL/src/Utils.cpp
@@ -11,6 +11,7 @@
 
 #include "Utils.h"
 
+#include "Debug.h"
 #include "Interface.h"
 #include "Mapping.h"
 
@@ -129,10 +130,12 @@
 
 extern "C" {
 int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
+  FunctionTracingRAII();
   return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
 }
 
 int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
+  FunctionTracingRAII();
   uint32_t lo, hi;
   utils::unpack(Val, lo, hi);
   hi = impl::shuffleDown(lanes::All, hi, Delta, Width);
diff --git a/libomptarget/DeviceRTL/src/Workshare.cpp b/libomptarget/DeviceRTL/src/Workshare.cpp
index 89c10b1..24f3fee 100644
--- a/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -470,6 +470,7 @@
 // init
 void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                             int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
       loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
@@ -478,6 +479,7 @@
 void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                              uint32_t lb, uint32_t ub, int32_t st,
                              int32_t chunk) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
       loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
@@ -485,6 +487,7 @@
 
 void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                             int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
       loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
@@ -493,6 +496,7 @@
 void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                              uint64_t lb, uint64_t ub, int64_t st,
                              int64_t chunk) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
       loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
@@ -501,6 +505,7 @@
 // next
 int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                            int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
       loc, tid, p_last, p_lb, p_ub, p_st, DST);
@@ -508,6 +513,7 @@
 
 int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                             uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
       loc, tid, p_last, p_lb, p_ub, p_st, DST);
@@ -515,6 +521,7 @@
 
 int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                            int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
       loc, tid, p_last, p_lb, p_ub, p_st, DST);
@@ -522,6 +529,7 @@
 
 int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                             uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
       loc, tid, p_last, p_lb, p_ub, p_st, DST);
@@ -529,21 +537,25 @@
 
 // fini
 void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
   popDST();
 }
 
 void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
   popDST();
 }
 
 void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
   popDST();
 }
 
 void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
   popDST();
 }
@@ -556,6 +568,7 @@
                               int32_t schedtype, int32_t *plastiter,
                               int32_t *plower, int32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -565,6 +578,7 @@
                                int32_t schedtype, int32_t *plastiter,
                                uint32_t *plower, uint32_t *pupper,
                                int32_t *pstride, int32_t incr, int32_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -574,6 +588,7 @@
                               int32_t schedtype, int32_t *plastiter,
                               int64_t *plower, int64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -583,6 +598,7 @@
                                int32_t schedtype, int32_t *plastiter,
                                uint64_t *plower, uint64_t *pupper,
                                int64_t *pstride, int64_t incr, int64_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -593,6 +609,7 @@
                                      int32_t *plower, int32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -603,6 +620,7 @@
                                       uint32_t *plower, uint32_t *pupper,
                                       int32_t *pstride, int32_t incr,
                                       int32_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -613,6 +631,7 @@
                                      int64_t *plower, int64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -623,14 +642,19 @@
                                       uint64_t *plower, uint64_t *pupper,
                                       int64_t *pstride, int64_t incr,
                                       int64_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
 }
 
-void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
+void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {
+  FunctionTracingRAII();
+}
 
-void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
+void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {
+  FunctionTracingRAII();
+}
 }
 
 #pragma omp end declare target