// tools/archer/ompt-tsan.cpp - llvm-project/openmp - Git at Google

 /*
  * ompt-tsan.cpp -- Archer runtime library, TSan annotations for Archer
  */

 //===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for details.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
 #endif

 #include <algorithm>
 #include <atomic>
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
 #include <inttypes.h>
 #include <iostream>
 #include <list>
 #include <mutex>
 #include <sstream>
 #include <stack>
 #include <string>
 #include <unordered_map>
 #include <vector>

 #if (defined __APPLE__ && defined __MACH__)
 #include <dlfcn.h>
 #endif

 #include "omp-tools.h"
 #include <sys/resource.h>

 // KMP_FALLTHROUGH() marks an intentional fall through from the previous case
 // label so that the compiler does not diagnose it.
 //   Code from libcxx/include/__config
 // It is a function-like macro so that every use must be followed by a
 // semicolon. The first matching spelling below is selected; when none is
 // available the macro expands to a no-op expression.
 #if __cplusplus > 201402L && __has_cpp_attribute(fallthrough)
 #define KMP_FALLTHROUGH() [[fallthrough]]
 #elif __has_cpp_attribute(clang::fallthrough)
 #define KMP_FALLTHROUGH() [[clang::fallthrough]]
 #elif __has_attribute(fallthrough) || __GNUC__ >= 7
 #define KMP_FALLTHROUGH() __attribute__((__fallthrough__))
 #else
 #define KMP_FALLTHROUGH() ((void)0)
 #endif

 // Set to 1 by ompt_start_tool before it calls RunningOnValgrind(); the
 // fallback implementations of RunningOnValgrind() in this file reset it to 0.
 static int runOnTsan;
 // Result of registering the reduction callback in ompt_tsan_initialize; the
 // barrier handling compares it against ompt_set_always.
 static int hasReductionCallback;

 /// Runtime options of Archer, parsed from a space separated list of
 /// `name=<int>` entries (the content of the ARCHER_OPTIONS variable).
 ///
 /// Members keep their defaults unless a well-formed entry overrides them.
 class ArcherFlags {
 public:
 #if (LLVM_VERSION) >= 40
   int flush_shadow{0};
 #endif
   int print_max_rss{0};
   int verbose{0};
   int enabled{1};
   int ignore_serial{0};

   /// Parse \p env; a null pointer leaves all defaults in place. Entries that
   /// are not recognized or carry no integer value are reported on stderr.
   ArcherFlags(const char *env) {
     if (!env)
       return;

     std::string str(env);
     std::istringstream iss(str);
     std::string token;
     while (std::getline(iss, token, ' ')) {
       // Consecutive spaces yield empty entries; they carry no information.
       if (token.empty())
         continue;
       if (!parseEntry(token.c_str()))
         std::cerr << "Illegal values for ARCHER_OPTIONS variable: " << token
                   << std::endl;
     }
   }

 private:
   /// Try to interpret a single `name=<int>` entry.
   /// \returns true if the entry matched a known option and a value was stored.
   ///
   /// sscanf returns EOF (which is non-zero) when the input ends before the
   /// first conversion, e.g. for "verbose=" or "verb". Comparing against 1
   /// makes sure such truncated entries are not mistaken for a match.
   bool parseEntry(const char *entry) {
 #if (LLVM_VERSION) >= 40
     if (sscanf(entry, "flush_shadow=%d", &flush_shadow) == 1)
       return true;
 #endif
     if (sscanf(entry, "print_max_rss=%d", &print_max_rss) == 1)
       return true;
     if (sscanf(entry, "verbose=%d", &verbose) == 1)
       return true;
     // The option is spelled "enable" although the member is called enabled.
     if (sscanf(entry, "enable=%d", &enabled) == 1)
       return true;
     if (sscanf(entry, "ignore_serial=%d", &ignore_serial) == 1)
       return true;
     return false;
   }
 };

 /// The part of a TSan option string (TSAN_OPTIONS) that Archer looks at.
 class TsanFlags {
 public:
   int ignore_noninstrumented_modules;

   /// Scan \p env for `ignore_noninstrumented_modules=<int>`; the value is only
   /// needed to decide whether a warning should be printed. A null \p env
   /// leaves the flag at 0. If the option occurs several times, the last
   /// well-formed occurrence wins.
   TsanFlags(const char *env) : ignore_noninstrumented_modules(0) {
     if (env == nullptr)
       return;

     // Entries may be separated by any of these characters.
     static const char Separators[] = " ,:\n\t\r";
     const std::string Options(env);

     std::string::size_type Begin = 0;
     while (Begin < Options.size()) {
       std::string::size_type Stop = Options.find_first_of(Separators, Begin);
       if (Stop == std::string::npos)
         Stop = Options.size();

       const std::string Entry = Options.substr(Begin, Stop - Begin);
       // A non-matching entry leaves the member untouched.
       (void)sscanf(Entry.c_str(), "ignore_noninstrumented_modules=%d",
                    &ignore_noninstrumented_modules);

       // Continue behind the separator (or stop at the end of the string).
       Begin = Stop + 1;
     }
   }
 };

 #if (LLVM_VERSION) >= 40
 extern "C" {
 // Weak declaration without a definition in this file: the address is tested
 // in ompt_tsan_parallel_end before the function is called.
 int __attribute__((weak)) __archer_get_omp_status();
 // Weak, empty fallback definition of the shadow memory flush.
 void __attribute__((weak)) __tsan_flush_memory() {}
 }
 #endif
 // Options parsed from ARCHER_OPTIONS; allocated in ompt_start_tool and
 // released in ompt_tsan_finalize.
 ArcherFlags *archer_flags;

 // The following definitions are pasted from "llvm/Support/Compiler.h" to
 // allow the code to be compiled with other compilers like gcc:

 #ifndef TsanHappensBefore
 // Thread Sanitizer is a tool that finds races in code.
 // See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
 // tsan detects these exact functions by name.
 extern "C" {
 #if (defined __APPLE__ && defined __MACH__)
 /// Forward to the `AnnotateHappensAfter` symbol resolved with dlsym.
 ///
 /// The lookup is done once and cached. If the symbol cannot be resolved the
 /// call is skipped instead of jumping through a null pointer.
 static void AnnotateHappensAfter(const char *file, int line,
                                  const volatile void *cv) {
   using AnnotateFn = void (*)(const char *, int, const volatile void *);
   static const AnnotateFn fptr = reinterpret_cast<AnnotateFn>(
       dlsym(RTLD_DEFAULT, "AnnotateHappensAfter"));
   if (fptr)
     fptr(file, line, cv);
 }
 /// Forward to the `AnnotateHappensBefore` symbol resolved with dlsym.
 ///
 /// The lookup is done once and cached. If the symbol cannot be resolved the
 /// call is skipped instead of jumping through a null pointer.
 static void AnnotateHappensBefore(const char *file, int line,
                                   const volatile void *cv) {
   using AnnotateFn = void (*)(const char *, int, const volatile void *);
   static const AnnotateFn fptr = reinterpret_cast<AnnotateFn>(
       dlsym(RTLD_DEFAULT, "AnnotateHappensBefore"));
   if (fptr)
     fptr(file, line, cv);
 }
 /// Forward to the `AnnotateIgnoreWritesBegin` symbol resolved with dlsym.
 ///
 /// The lookup is done once and cached. If the symbol cannot be resolved the
 /// call is skipped instead of jumping through a null pointer.
 static void AnnotateIgnoreWritesBegin(const char *file, int line) {
   using AnnotateFn = void (*)(const char *, int);
   static const AnnotateFn fptr = reinterpret_cast<AnnotateFn>(
       dlsym(RTLD_DEFAULT, "AnnotateIgnoreWritesBegin"));
   if (fptr)
     fptr(file, line);
 }
 /// Forward to the `AnnotateIgnoreWritesEnd` symbol resolved with dlsym.
 ///
 /// The lookup is done once and cached. If the symbol cannot be resolved the
 /// call is skipped instead of jumping through a null pointer.
 static void AnnotateIgnoreWritesEnd(const char *file, int line) {
   using AnnotateFn = void (*)(const char *, int);
   static const AnnotateFn fptr = reinterpret_cast<AnnotateFn>(
       dlsym(RTLD_DEFAULT, "AnnotateIgnoreWritesEnd"));
   if (fptr)
     fptr(file, line);
 }
 /// Forward to the `AnnotateNewMemory` symbol resolved with dlsym.
 ///
 /// The lookup is done once and cached. If the symbol cannot be resolved the
 /// call is skipped instead of jumping through a null pointer.
 static void AnnotateNewMemory(const char *file, int line,
                               const volatile void *cv, size_t size) {
   using AnnotateFn =
       void (*)(const char *, int, const volatile void *, size_t);
   static const AnnotateFn fptr = reinterpret_cast<AnnotateFn>(
       dlsym(RTLD_DEFAULT, "AnnotateNewMemory"));
   if (fptr)
     fptr(file, line, cv, size);
 }
 /// Probe for a `RunningOnValgrind` symbol provided by another module and
 /// clear runOnTsan when there is none. Always returns 0.
 ///
 /// This mirrors the weak fallback used on other platforms, which clears
 /// runOnTsan whenever the local definition (and not one supplied by the
 /// sanitizer runtime) ends up being called. The previous condition was the
 /// exact negation: it cleared runOnTsan when a foreign symbol WAS found and
 /// kept it set when dlsym returned null, in which case the other dlsym based
 /// wrappers have nothing to forward to either.
 static int RunningOnValgrind() {
   int (*fptr)();

   fptr = (int (*)())dlsym(RTLD_DEFAULT, "RunningOnValgrind");
   // Nothing found, or only this very function: no annotation provider.
   if (!fptr || fptr == RunningOnValgrind)
     runOnTsan = 0;
   return 0;
 }
 #else
 // Weak, empty fallback definitions of the annotation entry points. Being
 // weak, they are replaced at link/load time by strong definitions of the same
 // name if another module provides them.
 void __attribute__((weak))
 AnnotateHappensAfter(const char *file, int line, const volatile void *cv) {}
 void __attribute__((weak))
 AnnotateHappensBefore(const char *file, int line, const volatile void *cv) {}
 void __attribute__((weak))
 AnnotateIgnoreWritesBegin(const char *file, int line) {}
 void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char *file, int line) {
 }
 void __attribute__((weak))
 AnnotateNewMemory(const char *file, int line, const volatile void *cv,
                   size_t size) {}
 // If this weak fallback is the definition that gets called, no other module
 // supplied RunningOnValgrind; ompt_start_tool treats that as "not running
 // under TSan" by checking runOnTsan afterwards.
 int __attribute__((weak)) RunningOnValgrind() {
   runOnTsan = 0;
   return 0;
 }
 // Weak, empty fallbacks for the function entry/exit hooks used by
 // TsanFuncEntry / TsanFuncExit.
 void __attribute__((weak)) __tsan_func_entry(const void *call_pc) {}
 void __attribute__((weak)) __tsan_func_exit(void) {}
 #endif
 }

 // This marker is used to define a happens-before arc. The race detector will
 // infer an arc from the begin to the end when they share the same pointer
 // argument.
 #define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv)

 // This marker defines the destination of a happens-before arc.
 #define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv)

 // Ignore any races on writes between here and the next TsanIgnoreWritesEnd.
 #define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__)

 // Resume checking for racy writes.
 #define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)

 // We don't really delete the clock for now: the macro expands to nothing, so
 // its argument is never evaluated.
 #define TsanDeleteClock(cv)

 // Announce a memory range to the race detector. Note that TsanFreeMemory
 // forwards to AnnotateNewMemory as well; this file declares no separate
 // "free" annotation.
 #define TsanNewMemory(addr, size)                                              \
   AnnotateNewMemory(__FILE__, __LINE__, addr, size)
 #define TsanFreeMemory(addr, size)                                             \
   AnnotateNewMemory(__FILE__, __LINE__, addr, size)
 #endif

 // Function entry/exit hooks; the callbacks below pass the code pointer
 // reported by the OpenMP runtime as `pc`.
 #define TsanFuncEntry(pc) __tsan_func_entry(pc)
 #define TsanFuncExit() __tsan_func_exit()

 /// Required OMPT inquiry functions. Both pointers are filled in by
 /// ompt_tsan_initialize through the runtime's lookup function.
 static ompt_get_parallel_info_t ompt_get_parallel_info;
 static ompt_get_thread_data_t ompt_get_thread_data;

 /// Storage type of the "clock" members below; only the ADDRESS of such a
 /// member is handed to the happens-before annotations.
 typedef uint64_t ompt_tsan_clockid;

 /// Hand out consecutive ids starting at 0; safe to call from several threads.
 static uint64_t my_next_id() {
   static std::atomic<uint64_t> NextId{0};
   // fetch_add yields the value before the increment.
   return NextId.fetch_add(1);
 }

 // Data structure to provide a threadsafe pool of reusable objects.
 // DataPool<Type of objects, Size of blockalloc>
 //
 // Storage is handed out raw: no constructor or destructor of T is run by the
 // pool. All public operations take DPMutex through a scoped guard, so the
 // mutex is released even when growing one of the containers throws.
 template <typename T, int N> struct DataPool {
   std::mutex DPMutex;
   std::stack<T *> DataPointer;
   std::list<void *> memory;
   int total;

   // Allocate a block of N further objects and make them available.
   // Callers must hold DPMutex.
   void newDatas() {
     // prefix the Data with a pointer to 'this', allows to return memory to
     // 'this', without explicitly knowing the source.
     //
     // To reduce lock contention, we use thread local DataPools, but Data
     // objects move to other threads.
     // The strategy is to get objects from local pool. Only if the object
     // moved to another thread, we might see a penalty on release
     // (returnData).
     // For "single producer" pattern, a single thread creates tasks, these
     // are executed by other threads.
     // The master will have a high demand on TaskData, so return after use.
     //
     // NOTE(review): retData reads the pointer directly in front of the
     // object, which presumes there is no padding between 'dp' and 'data'
     // (i.e. alignof(T) does not exceed the pointer size) -- confirm for new
     // element types.
     struct pooldata {
       DataPool<T, N> *dp;
       T data;
     };
     // We alloc without initialize the memory. We cannot call constructors.
     // Therefore use malloc!
     pooldata *datas = (pooldata *)malloc(sizeof(pooldata) * N);
     if (datas == nullptr) {
       // Writing the back pointers below would dereference null; stop with a
       // diagnostic instead.
       std::cerr << "Archer: out of memory while growing a DataPool"
                 << std::endl;
       std::abort();
     }
     memory.push_back(datas);
     for (int i = 0; i < N; i++) {
       datas[i].dp = this;
       DataPointer.push(&(datas[i].data));
     }
     total += N;
   }

   // Take one object out of the pool, growing the pool if it is empty.
   T *getData() {
     std::lock_guard<std::mutex> Guard(DPMutex);
     if (DataPointer.empty())
       newDatas();
     T *ret = DataPointer.top();
     DataPointer.pop();
     return ret;
   }

   // Put one object back; it becomes the next one handed out.
   void returnData(T *data) {
     std::lock_guard<std::mutex> Guard(DPMutex);
     DataPointer.push(data);
   }

   // Take n objects at once and store them in datas[0..n-1].
   void getDatas(int n, T **datas) {
     std::lock_guard<std::mutex> Guard(DPMutex);
     for (int i = 0; i < n; i++) {
       if (DataPointer.empty())
         newDatas();
       datas[i] = DataPointer.top();
       DataPointer.pop();
     }
   }

   // Put n objects back, in the order datas[0..n-1].
   void returnDatas(int n, T **datas) {
     std::lock_guard<std::mutex> Guard(DPMutex);
     for (int i = 0; i < n; i++) {
       DataPointer.push(datas[i]);
     }
   }

   DataPool() : DPMutex(), DataPointer(), total(0) {}

   ~DataPool() {
     // we assume all memory is returned when the thread finished / destructor
     // is called
     for (void *block : memory)
       free(block);
   }
 };

 // This function takes care to return the data to the originating DataPool
 // A pointer to the originating DataPool is stored just before the actual data.
 template <typename T, int N> static void retData(void *data) {
   ((DataPool<T, N> **)data)[-1]->returnData((T *)data);
 }

 struct ParallelData;
 /// Pool of the current thread from which ParallelData objects are taken.
 __thread DataPool<ParallelData, 4> *pdp;

 /// Bookkeeping that Archer attaches to a parallel region.
 struct ParallelData {

   /// Two addresses for relationships with barriers. The parallel fork is
   /// handled like just another barrier and uses the second entry.
   ompt_tsan_clockid Barrier[2];

   /// Code pointer passed to the constructor.
   const void *codePtr;

   /// Address that represents the fork of this region.
   void *GetParallelPtr() { return GetBarrierPtr(1); }

   /// Address that represents barrier number \p Index (0 or 1).
   void *GetBarrierPtr(unsigned Index) { return Barrier + Index; }

   ParallelData(const void *codeptr) : codePtr(codeptr) {}
   ~ParallelData() {
     TsanDeleteClock(Barrier + 0);
     TsanDeleteClock(Barrier + 1);
   }

   // Storage comes from the calling thread's pool and goes back to the pool
   // that handed it out.
   void *operator new(size_t size) { return pdp->getData(); }
   void operator delete(void *p, size_t) { retData<ParallelData, 4>(p); }
 };

 /// Interpret the tool data of a parallel region as Archer's ParallelData.
 static inline ParallelData *ToParallelData(ompt_data_t *parallel_data) {
   void *Raw = parallel_data->ptr;
   return static_cast<ParallelData *>(Raw);
 }

 struct Taskgroup;
 /// Pool of the current thread from which Taskgroup objects are taken.
 __thread DataPool<Taskgroup, 4> *tgp;

 /// One level of the taskgroup stack; used to synchronize with the tasks that
 /// belong to the taskgroup.
 struct Taskgroup {
   /// Its address is used for relationships of the taskgroup's task set.
   ompt_tsan_clockid Ptr;

   /// The enclosing taskgroup, i.e. the next entry on the stack.
   Taskgroup *Parent;

   Taskgroup(Taskgroup *Enclosing) : Parent(Enclosing) {}
   ~Taskgroup() { TsanDeleteClock(GetPtr()); }

   void *GetPtr() { return &Ptr; }

   // Storage comes from the calling thread's pool and goes back to the pool
   // that handed it out.
   void *operator new(size_t size) { return tgp->getData(); }
   void operator delete(void *p, size_t) { retData<Taskgroup, 4>(p); }
 };

 struct TaskData;
 /// Pool of the current thread from which TaskData objects are taken.
 __thread DataPool<TaskData, 4> *tdp;

 /// Data structure to store additional information for tasks.
 ///
 /// operator new below takes storage from a DataPool, which hands out raw
 /// memory; every member that is read later therefore has to be set by a
 /// constructor or a default member initializer.
 struct TaskData {
   /// Its address is used for relationships of this task.
   ompt_tsan_clockid Task;

   /// Child tasks use its address to declare a relationship to a taskwait in
   /// this task.
   ompt_tsan_clockid Taskwait;

   /// Whether this task is currently executing a barrier.
   bool InBarrier;

   /// OMPT task flags passed at creation; queried by the is*() helpers.
   int TaskType{0};

   /// Index of which barrier to use next.
   char BarrierIndex;

   /// Count how often this structure has been put into child tasks + 1.
   std::atomic_int RefCount;

   /// Reference to the parent that created this task.
   TaskData *Parent;

   /// Reference to the implicit task in the stack above this task.
   TaskData *ImplicitTask;

   /// Reference to the team of this task.
   ParallelData *Team;

   /// Reference to the current taskgroup that this task either belongs to or
   /// that it just created.
   Taskgroup *TaskGroup;

   /// Dependency information for this task (owned array, or null).
   ompt_dependence_t *Dependencies{nullptr};

   /// Number of dependency entries.
   unsigned DependencyCount;

   void *PrivateData{nullptr};
   size_t PrivateDataSize{0};

   /// 0 until the task is scheduled for the first time; also incremented on
   /// the parent whenever it creates a deferred child task.
   int execution;
   /// Set once the implicit task end event was seen (checked by assertions).
   int freed;

   /// Create the data of a task spawned by \p Parent.
   /// The team is inherited from the parent. The parent is only dereferenced
   /// when it is non-null, matching the check in the constructor body.
   TaskData(TaskData *Parent, int taskType)
       : InBarrier(false), TaskType(taskType), BarrierIndex(0), RefCount(1),
         Parent(Parent), ImplicitTask(nullptr),
         Team(Parent != nullptr ? Parent->Team : nullptr), TaskGroup(nullptr),
         DependencyCount(0), execution(0), freed(0) {
     if (Parent != nullptr) {
       Parent->RefCount++;
       // Copy over pointer to taskgroup. This task may set up its own stack
       // but for now belongs to its parent's taskgroup.
       TaskGroup = Parent->TaskGroup;
     }
   }

   /// Create the data of a task without parent that runs in \p Team; such a
   /// task is its own implicit task and counts as already executing.
   TaskData(ParallelData *Team, int taskType)
       : InBarrier(false), TaskType(taskType), BarrierIndex(0), RefCount(1),
         Parent(nullptr), ImplicitTask(this), Team(Team), TaskGroup(nullptr),
         DependencyCount(0), execution(1), freed(0) {}

   ~TaskData() {
     TsanDeleteClock(&Task);
     TsanDeleteClock(&Taskwait);
   }

   bool isIncluded() { return TaskType & ompt_task_undeferred; }
   bool isUntied() { return TaskType & ompt_task_untied; }
   bool isFinal() { return TaskType & ompt_task_final; }
   bool isMergable() { return TaskType & ompt_task_mergeable; }
   bool isMerged() { return TaskType & ompt_task_merged; }

   bool isExplicit() { return TaskType & ompt_task_explicit; }
   bool isImplicit() { return TaskType & ompt_task_implicit; }
   bool isInitial() { return TaskType & ompt_task_initial; }
   bool isTarget() { return TaskType & ompt_task_target; }

   void *GetTaskPtr() { return &Task; }

   void *GetTaskwaitPtr() { return &Taskwait; }
   // overload new/delete to use DataPool for memory management.
   void *operator new(size_t size) { return tdp->getData(); }
   void operator delete(void *p, size_t) { retData<TaskData, 4>(p); }
 };

 /// Interpret the tool data of a task as Archer's TaskData.
 static inline TaskData *ToTaskData(ompt_data_t *task_data) {
   void *Raw = task_data->ptr;
   return static_cast<TaskData *>(Raw);
 }

 /// Derive the address that stands for "in" dependences on a variable from the
 /// variable's own address: the byte directly behind \p OutAddr.
 static inline void *ToInAddr(void *OutAddr) {
   // FIXME: This will give false negatives when a second variable lays directly
   //        behind a variable that only has a width of 1 byte.
   //        Another approach would be to "negate" the address or to flip the
   //        first bit...
   char *Bytes = static_cast<char *>(OutAddr);
   return Bytes + 1;
 }

 /// Store a mutex for each wait_id to resolve race condition with callbacks.
 /// Entries are created on first use by the mutex callbacks and are not
 /// erased anywhere in this file.
 std::unordered_map<ompt_wait_id_t, std::mutex> Locks;
 /// Guards lookups and insertions in Locks (not the per-wait_id mutexes).
 std::mutex LocksMutex;

 /// Thread begin callback: set up the thread-local object pools and assign a
 /// unique id to the thread.
 ///
 /// Each freshly allocated pool is announced to the race detector with the
 /// size of the pool OBJECT; `sizeof(pdp)` would only cover the size of the
 /// pointer.
 static void ompt_tsan_thread_begin(ompt_thread_t thread_type,
                                    ompt_data_t *thread_data) {
   pdp = new DataPool<ParallelData, 4>;
   TsanNewMemory(pdp, sizeof(*pdp));
   tgp = new DataPool<Taskgroup, 4>;
   TsanNewMemory(tgp, sizeof(*tgp));
   tdp = new DataPool<TaskData, 4>;
   TsanNewMemory(tdp, sizeof(*tdp));
   thread_data->value = my_next_id();
 }

 /// Thread end callback: release the thread-local object pools.
 ///
 /// The pointers are reset afterwards so that the thread does not keep
 /// dangling references to the destroyed pools.
 static void ompt_tsan_thread_end(ompt_data_t *thread_data) {
   delete pdp;
   pdp = nullptr;
   delete tgp;
   tgp = nullptr;
   delete tdp;
   tdp = nullptr;
 }

 /// OMPT event callbacks for handling parallel regions.

 /// Parallel begin callback: attach a new ParallelData to the region and
 /// publish the fork as the source of a happens-before arc.
 static void ompt_tsan_parallel_begin(ompt_data_t *parent_task_data,
                                      const ompt_frame_t *parent_task_frame,
                                      ompt_data_t *parallel_data,
                                      uint32_t requested_team_size, int flag,
                                      const void *codeptr_ra) {
   ParallelData *Region = new ParallelData(codeptr_ra);
   parallel_data->ptr = Region;

   TsanHappensBefore(Region->GetParallelPtr());

   // With ignore_serial, writes of the initial task are ignored outside of
   // parallel regions; tracking is switched back on when it forks.
   if (!archer_flags->ignore_serial)
     return;
   if (ToTaskData(parent_task_data)->isInitial())
     TsanIgnoreWritesEnd();
 }

 /// Parallel end callback: join with both barrier addresses of the region and
 /// release its ParallelData.
 static void ompt_tsan_parallel_end(ompt_data_t *parallel_data,
                                    ompt_data_t *task_data, int flag,
                                    const void *codeptr_ra) {
   // With ignore_serial, the initial task stops being tracked again once the
   // region is over.
   const bool BackToSerial =
       archer_flags->ignore_serial && ToTaskData(task_data)->isInitial();
   if (BackToSerial)
     TsanIgnoreWritesBegin();

   ParallelData *Region = ToParallelData(parallel_data);
   for (unsigned Slot = 0; Slot < 2; ++Slot)
     TsanHappensAfter(Region->GetBarrierPtr(Slot));

   delete Region;

 #if (LLVM_VERSION >= 40)
   // __archer_get_omp_status is a weak symbol; only call it when it is present.
   if (&__archer_get_omp_status && __archer_get_omp_status() == 0 &&
       archer_flags->flush_shadow)
     __tsan_flush_memory();
 #endif
 }

 /// Implicit task callback.
 ///
 /// At scope begin a TaskData is attached to the task; an initial task
 /// additionally gets a ParallelData of its own. At scope end the TaskData is
 /// released again, together with the ParallelData that was allocated for an
 /// initial task (no other callback in this file frees that object, so it
 /// used to leak).
 static void ompt_tsan_implicit_task(ompt_scope_endpoint_t endpoint,
                                     ompt_data_t *parallel_data,
                                     ompt_data_t *task_data,
                                     unsigned int team_size,
                                     unsigned int thread_num, int type) {
   switch (endpoint) {
   case ompt_scope_begin:
     if (type & ompt_task_initial) {
       parallel_data->ptr = new ParallelData(nullptr);
     }
     task_data->ptr = new TaskData(ToParallelData(parallel_data), type);
     TsanHappensAfter(ToParallelData(parallel_data)->GetParallelPtr());
     TsanFuncEntry(ToParallelData(parallel_data)->codePtr);
     break;
   case ompt_scope_end: {
     TaskData *Data = ToTaskData(task_data);
     assert(Data->freed == 0 && "Implicit task end should only be called once!");
     Data->freed = 1;
     assert(Data->RefCount == 1 &&
            "All tasks should have finished at the implicit barrier!");
     // Use the flags recorded at creation to find out whether the team
     // belongs to this (initial) task; remember it before Data goes away.
     ParallelData *OwnedTeam = Data->isInitial() ? Data->Team : nullptr;
     delete Data;
     // Guard explicitly: the class-specific operator delete must not be
     // reached with a null pointer.
     if (OwnedTeam != nullptr)
       delete OwnedTeam;
     TsanFuncExit();
     break;
   }
   case ompt_scope_beginend:
     // Should not occur according to OpenMP 5.1
     // Tested in OMPT tests
     break;
   }
 }

 /// Sync region callback for barriers, taskwait and taskgroup.
 ///
 /// ompt_scope_begin runs the first half of the outer switch, ompt_scope_end
 /// the second half, and ompt_scope_beginend runs both by falling through.
 static void ompt_tsan_sync_region(ompt_sync_region_t kind,
                                   ompt_scope_endpoint_t endpoint,
                                   ompt_data_t *parallel_data,
                                   ompt_data_t *task_data,
                                   const void *codeptr_ra) {
   TaskData *Data = ToTaskData(task_data);
   switch (endpoint) {
   case ompt_scope_begin:
   case ompt_scope_beginend:
     TsanFuncEntry(codeptr_ra);
     switch (kind) {
     case ompt_sync_region_barrier_implementation:
     case ompt_sync_region_barrier_implicit:
     case ompt_sync_region_barrier_explicit:
     case ompt_sync_region_barrier_implicit_parallel:
     case ompt_sync_region_barrier_implicit_workshare:
     case ompt_sync_region_barrier_teams:
     case ompt_sync_region_barrier: {
       // Everything this task did so far happens before the barrier address
       // currently in use by the team.
       char BarrierIndex = Data->BarrierIndex;
       TsanHappensBefore(Data->Team->GetBarrierPtr(BarrierIndex));

       if (hasReductionCallback < ompt_set_always) {
         // We ignore writes inside the barrier. These would either occur during
         // 1. reductions performed by the runtime which are guaranteed to be
         // race-free.
         // 2. execution of another task.
         // For the latter case we will re-enable tracking in task_switch.
         Data->InBarrier = true;
         TsanIgnoreWritesBegin();
       }

       break;
     }

     case ompt_sync_region_taskwait:
       break;

     case ompt_sync_region_taskgroup:
       // Push a new taskgroup on top of the task's taskgroup stack.
       Data->TaskGroup = new Taskgroup(Data->TaskGroup);
       break;

     case ompt_sync_region_reduction:
       // should never be reached
       break;
     }
     // A plain begin stops here; beginend continues with the end handling.
     if (endpoint == ompt_scope_begin)
       break;
     KMP_FALLTHROUGH();
   case ompt_scope_end:
     TsanFuncExit();
     switch (kind) {
     case ompt_sync_region_barrier_implementation:
     case ompt_sync_region_barrier_implicit:
     case ompt_sync_region_barrier_explicit:
     case ompt_sync_region_barrier_implicit_parallel:
     case ompt_sync_region_barrier_implicit_workshare:
     case ompt_sync_region_barrier_teams:
     case ompt_sync_region_barrier: {
       if (hasReductionCallback < ompt_set_always) {
         // We want to track writes after the barrier again.
         Data->InBarrier = false;
         TsanIgnoreWritesEnd();
       }

       char BarrierIndex = Data->BarrierIndex;
       // Barrier will end after it has been entered by all threads.
       if (parallel_data)
         TsanHappensAfter(Data->Team->GetBarrierPtr(BarrierIndex));

       // It is not guaranteed that all threads have exited this barrier before
       // we enter the next one. So we will use a different address.
       // We are however guaranteed that this current barrier is finished
       // by the time we exit the next one. So we can then reuse the first
       // address.
       Data->BarrierIndex = (BarrierIndex + 1) % 2;
       break;
     }

     case ompt_sync_region_taskwait: {
       // execution is 1 for a running task and is incremented for every
       // deferred child it creates (see task_create), so a value above 1
       // means there is a child to synchronize with.
       if (Data->execution > 1)
         TsanHappensAfter(Data->GetTaskwaitPtr());
       break;
     }

     case ompt_sync_region_taskgroup: {
       assert(Data->TaskGroup != nullptr &&
              "Should have at least one taskgroup!");

       TsanHappensAfter(Data->TaskGroup->GetPtr());

       // Delete this allocated taskgroup, all descendent task are finished by
       // now.
       Taskgroup *Parent = Data->TaskGroup->Parent;
       delete Data->TaskGroup;
       Data->TaskGroup = Parent;
       break;
     }

     case ompt_sync_region_reduction:
       // Should not occur according to OpenMP 5.1
       // Tested in OMPT tests
       break;
     }
     break;
   }
 }

 /// Reduction callback: writes are ignored between the begin and the end of a
 /// reduction region. Other kinds of sync regions are left alone here.
 static void ompt_tsan_reduction(ompt_sync_region_t kind,
                                 ompt_scope_endpoint_t endpoint,
                                 ompt_data_t *parallel_data,
                                 ompt_data_t *task_data,
                                 const void *codeptr_ra) {
   if (kind != ompt_sync_region_reduction)
     return;

   if (endpoint == ompt_scope_begin) {
     TsanIgnoreWritesBegin();
   } else if (endpoint == ompt_scope_end) {
     TsanIgnoreWritesEnd();
   }
   // ompt_scope_beginend:
   // Should not occur according to OpenMP 5.1
   // Tested in OMPT tests
   // Would have no implications for DR detection
 }

 /// OMPT event callbacks for handling tasks.

 /// Task create callback: attach a TaskData to the new task.
 ///
 /// - initial task: gets a fresh ParallelData as its team
 /// - undeferred task: child of the parent, no synchronization recorded
 /// - explicit / target task: child of the parent; the creation is published
 ///   as the source of a happens-before arc to the start of the task
 /// Tasks of any other type are left without TaskData.
 static void ompt_tsan_task_create(
     ompt_data_t *parent_task_data,    /* id of parent task            */
     const ompt_frame_t *parent_frame, /* frame data for parent task   */
     ompt_data_t *new_task_data,       /* id of created task           */
     int type, int has_dependences,
     const void *codeptr_ra) /* pointer to outlined function */
 {
   assert(new_task_data->ptr == NULL &&
          "Task data should be initialized to NULL");

   if (type & ompt_task_initial) {
     ompt_data_t *parallel_data;
     int team_size = 1;
     ompt_get_parallel_info(0, &parallel_data, &team_size);
     ParallelData *Region = new ParallelData(nullptr);
     parallel_data->ptr = Region;

     new_task_data->ptr = new TaskData(Region, type);
     return;
   }

   const bool Undeferred = type & ompt_task_undeferred;
   const bool Deferrable =
       (type & ompt_task_explicit) || (type & ompt_task_target);
   if (!Undeferred && !Deferrable)
     return;

   TaskData *Creator = ToTaskData(parent_task_data);
   TaskData *Created = new TaskData(Creator, type);
   new_task_data->ptr = Created;
   if (Undeferred)
     return;

   // Use the newly created address. We cannot use a single address from the
   // parent because that would declare wrong relationships with other
   // sibling tasks that may be created before this task is started!
   TsanHappensBefore(Created->GetTaskPtr());
   Creator->execution++;
 }

 /// Drop one reference to \p task. When the count reaches zero the task (and
 /// its copy of the dependences) is freed and the reference it held on its
 /// parent is dropped as well, continuing up the chain of ancestors.
 static void __ompt_tsan_release_task(TaskData *task) {
   TaskData *Current = task;
   while (Current != nullptr) {
     // Somebody else still refers to this task: stop here.
     if (--Current->RefCount != 0)
       break;

     TaskData *Ancestor = Current->Parent;
     if (Current->DependencyCount > 0)
       delete[] Current->Dependencies;
     delete Current;
     Current = Ancestor;
   }
 }

 /// Task schedule callback: called when execution moves from the task behind
 /// \p first_task_data to the task behind \p second_task_data, or when a
 /// detached task is fulfilled. \p prior_task_status tells what happened to
 /// the first task.
 static void ompt_tsan_task_schedule(ompt_data_t *first_task_data,
                                     ompt_task_status_t prior_task_status,
                                     ompt_data_t *second_task_data) {

   //
   //  The necessary action depends on prior_task_status:
   //
   //    ompt_task_early_fulfill = 5,
   //     -> ignored
   //
   //    ompt_task_late_fulfill  = 6,
   //     -> first completed, first freed, second ignored
   //
   //    ompt_task_complete      = 1,
   //    ompt_task_cancel        = 3,
   //     -> first completed, first freed, second starts
   //
   //    ompt_task_detach        = 4,
   //    ompt_task_yield         = 2,
   //    ompt_task_switch        = 7
   //     -> first suspended, second starts
   //

   if (prior_task_status == ompt_task_early_fulfill)
     return;

   TaskData *FromTask = ToTaskData(first_task_data);

   // Legacy handling for missing reduction callback
   if (hasReductionCallback < ompt_set_always && FromTask->InBarrier) {
     // We want to ignore writes in the runtime code during barriers,
     // but not when executing tasks with user code!
     TsanIgnoreWritesEnd();
   }

   // The late fulfill happens after the detached task finished execution
   if (prior_task_status == ompt_task_late_fulfill)
     TsanHappensAfter(FromTask->GetTaskPtr());

   // task completed execution
   if (prior_task_status == ompt_task_complete ||
       prior_task_status == ompt_task_cancel ||
       prior_task_status == ompt_task_late_fulfill) {
     // Included tasks are executed sequentially, no need to track
     // synchronization
     if (!FromTask->isIncluded()) {
       // Task will finish before a barrier in the surrounding parallel region
       // ...
       ParallelData *PData = FromTask->Team;
       TsanHappensBefore(
           PData->GetBarrierPtr(FromTask->ImplicitTask->BarrierIndex));

       // ... and before an eventual taskwait by the parent thread.
       // NOTE(review): dereferences Parent without a null check; presumably
       // only tasks with a parent complete through this path -- confirm.
       TsanHappensBefore(FromTask->Parent->GetTaskwaitPtr());

       if (FromTask->TaskGroup != nullptr) {
         // This task is part of a taskgroup, so it will finish before the
         // corresponding taskgroup_end.
         TsanHappensBefore(FromTask->TaskGroup->GetPtr());
       }
     }

     // release dependencies
     for (unsigned i = 0; i < FromTask->DependencyCount; i++) {
       ompt_dependence_t *Dependency = &FromTask->Dependencies[i];

       // in dependencies block following inout and out dependencies!
       TsanHappensBefore(ToInAddr(Dependency->variable.ptr));
       if (Dependency->dependence_type == ompt_dependence_type_out ||
           Dependency->dependence_type == ompt_dependence_type_inout) {
         TsanHappensBefore(Dependency->variable.ptr);
       }
     }
     // free the previously running task
     // (FromTask may be gone after this call; the code below only touches it
     // again for the "suspended" statuses, which never reach this branch)
     __ompt_tsan_release_task(FromTask);
   }

   // For late fulfill of detached task, there is no task to schedule to
   if (prior_task_status == ompt_task_late_fulfill) {
     return;
   }

   TaskData *ToTask = ToTaskData(second_task_data);
   // Legacy handling for missing reduction callback
   if (hasReductionCallback < ompt_set_always && ToTask->InBarrier) {
     // We re-enter runtime code which currently performs a barrier.
     TsanIgnoreWritesBegin();
   }

   // task suspended
   if (prior_task_status == ompt_task_switch ||
       prior_task_status == ompt_task_yield ||
       prior_task_status == ompt_task_detach) {
     // Task may be resumed at a later point in time.
     TsanHappensBefore(FromTask->GetTaskPtr());
     ToTask->ImplicitTask = FromTask->ImplicitTask;
     assert(ToTask->ImplicitTask != NULL &&
            "A task belongs to a team and has an implicit task on the stack");
   }

   // Handle dependencies on first execution of the task
   if (ToTask->execution == 0) {
     ToTask->execution++;
     for (unsigned i = 0; i < ToTask->DependencyCount; i++) {
       ompt_dependence_t *Dependency = &ToTask->Dependencies[i];

       TsanHappensAfter(Dependency->variable.ptr);
       // in and inout dependencies are also blocked by prior in dependencies!
       if (Dependency->dependence_type == ompt_dependence_type_out ||
           Dependency->dependence_type == ompt_dependence_type_inout) {
         TsanHappensAfter(ToInAddr(Dependency->variable.ptr));
       }
     }
   }
   // 1. Task will begin execution after it has been created.
   // 2. Task will resume after it has been switched away.
   TsanHappensAfter(ToTask->GetTaskPtr());
 }

 /// Dependences callback: keep a private copy of the task's dependence list.
 /// Nothing is recorded for a task without dependences.
 static void ompt_tsan_dependences(ompt_data_t *task_data,
                                   const ompt_dependence_t *deps, int ndeps) {
   if (ndeps <= 0)
     return;

   // The copy is needed later in task_switch and task_end.
   TaskData *Task = ToTaskData(task_data);
   ompt_dependence_t *Copy = new ompt_dependence_t[ndeps];
   std::copy(deps, deps + ndeps, Copy);
   Task->Dependencies = Copy;
   Task->DependencyCount = ndeps;

   // This callback is executed before this task is first started.
   TsanHappensBefore(Task->GetTaskPtr());
 }

 /// OMPT event callbacks for handling locking.

 /// Mutex acquired callback: take the tool's own mutex for \p wait_id and make
 /// the previous release of that lock the source of a happens-before arc.
 static void ompt_tsan_mutex_acquired(ompt_mutex_t kind, ompt_wait_id_t wait_id,
                                      const void *codeptr_ra) {
   // Look up (or create) the mutex that belongs to this wait_id. The table is
   // only locked for the duration of the lookup.
   std::mutex *Lock = nullptr;
   {
     std::lock_guard<std::mutex> TableGuard(LocksMutex);
     Lock = &Locks[wait_id];
   }

   // Acquire our own lock to make sure that
   // 1. the previous release has finished.
   // 2. the next acquire doesn't start before we have finished our release.
   Lock->lock();
   TsanHappensAfter(Lock);
 }

 /// Mutex released callback: publish the release as the source of a
 /// happens-before arc and give up the tool's own mutex for \p wait_id, which
 /// was taken in ompt_tsan_mutex_acquired.
 static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id,
                                      const void *codeptr_ra) {
   // The table is only locked for the duration of the lookup.
   std::mutex *Lock = nullptr;
   {
     std::lock_guard<std::mutex> TableGuard(LocksMutex);
     Lock = &Locks[wait_id];
   }

   // Annotate first, then let the next acquirer in.
   TsanHappensBefore(Lock);
   Lock->unlock();
 }

 // Helpers to register the ompt_tsan_<event> callbacks. All of them expand to
 // a call of `ompt_set_callback`, so a variable of that name must be in scope
 // where they are used.
 //
 // callback , signature , variable to store result , required support level
 // Prints a message to stdout when the runtime reports a support level below
 // `level` for the callback.
 #define SET_OPTIONAL_CALLBACK_T(event, type, result, level)                    \
   do {                                                                         \
     ompt_callback_##type##_t tsan_##event = &ompt_tsan_##event;                \
     result = ompt_set_callback(ompt_callback_##event,                          \
                                (ompt_callback_t)tsan_##event);                 \
     if (result < level)                                                        \
       printf("Registered callback '" #event "' is not supported at " #level    \
              " (%i)\n",                                                        \
              result);                                                          \
   } while (0)

 // Register a callback whose signature type differs from the event name and
 // which is expected to be supported at level ompt_set_always.
 #define SET_CALLBACK_T(event, type)                                            \
   do {                                                                         \
     int res;                                                                   \
     SET_OPTIONAL_CALLBACK_T(event, type, res, ompt_set_always);                \
   } while (0)

 // Register a callback whose signature type is named like the event.
 #define SET_CALLBACK(event) SET_CALLBACK_T(event, event)

 /// Tool initializer: fetch the OMPT entry points that are needed, register
 /// all callbacks and print a hint if TSAN_OPTIONS lacks
 /// ignore_noninstrumented_modules. Exits the process if a required entry
 /// point cannot be found.
 /// \returns 1 to signal success.
 static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
                                 ompt_data_t *tool_data) {
   TsanFlags tsan_flags(getenv("TSAN_OPTIONS"));

   // The SET_*CALLBACK macros used below call through a local variable with
   // exactly this name.
   ompt_set_callback_t ompt_set_callback =
       (ompt_set_callback_t)lookup("ompt_set_callback");
   if (!ompt_set_callback) {
     std::cerr << "Could not set callback, exiting..." << std::endl;
     std::exit(1);
   }

   ompt_get_parallel_info =
       (ompt_get_parallel_info_t)lookup("ompt_get_parallel_info");
   ompt_get_thread_data = (ompt_get_thread_data_t)lookup("ompt_get_thread_data");
   if (!ompt_get_parallel_info) {
     fprintf(stderr, "Could not get inquiry function 'ompt_get_parallel_info', "
                     "exiting...\n");
     exit(1);
   }

   // Threads, parallel regions and barriers.
   SET_CALLBACK(thread_begin);
   SET_CALLBACK(thread_end);
   SET_CALLBACK(parallel_begin);
   SET_CALLBACK(implicit_task);
   SET_CALLBACK(sync_region);
   SET_CALLBACK(parallel_end);

   // Tasking.
   SET_CALLBACK(task_create);
   SET_CALLBACK(task_schedule);
   SET_CALLBACK(dependences);

   // Locks; the reduction callback is optional and its support level is kept
   // for the barrier handling.
   SET_CALLBACK_T(mutex_acquired, mutex);
   SET_CALLBACK_T(mutex_released, mutex);
   SET_OPTIONAL_CALLBACK_T(reduction, sync_region, hasReductionCallback,
                           ompt_set_never);

   if (tsan_flags.ignore_noninstrumented_modules == 0)
     fprintf(stderr,
             "Warning: please export "
             "TSAN_OPTIONS='ignore_noninstrumented_modules=1' "
             "to avoid false positive reports from the OpenMP runtime!\n");

   // With ignore_serial, writes are ignored until the first parallel region.
   if (archer_flags->ignore_serial)
     TsanIgnoreWritesBegin();
   return 1; // success
 }

 /// Tool finalizer: re-enable write tracking if ignore_serial switched it off,
 /// optionally report the peak resident set size, and release the options.
 ///
 /// The options pointer is checked before its first use (it used to be
 /// dereferenced first and checked afterwards) and reset after the delete so
 /// that no dangling global is left behind.
 static void ompt_tsan_finalize(ompt_data_t *tool_data) {
   if (!archer_flags)
     return;

   if (archer_flags->ignore_serial)
     TsanIgnoreWritesEnd();
   if (archer_flags->print_max_rss) {
     struct rusage end;
     getrusage(RUSAGE_SELF, &end);
     printf("MAX RSS[KBytes] during execution: %ld\n", end.ru_maxrss);
   }

   delete archer_flags;
   archer_flags = nullptr;
 }

 /// Entry point looked up by the OpenMP runtime to activate the tool.
 ///
 /// Parses ARCHER_OPTIONS and returns the initializer/finalizer pair, or NULL
 /// when Archer is disabled by the options or the application does not run
 /// under TSan (so that a different tool gets the chance to be loaded).
 /// On the NULL paths the options are released and the global pointer is
 /// reset, so it never refers to freed memory.
 extern "C" ompt_start_tool_result_t *
 ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
   const char *options = getenv("ARCHER_OPTIONS");
   archer_flags = new ArcherFlags(options);
   if (!archer_flags->enabled) {
     if (archer_flags->verbose)
       std::cout << "Archer disabled, stopping operation" << std::endl;
     delete archer_flags;
     archer_flags = nullptr;
     return NULL;
   }

   static ompt_start_tool_result_t ompt_start_tool_result = {
       &ompt_tsan_initialize, &ompt_tsan_finalize, {0}};
   // RunningOnValgrind() resets runOnTsan if the fallback in this file is the
   // implementation that ends up being executed.
   runOnTsan = 1;
   RunningOnValgrind();
   if (!runOnTsan) // if we are not running on TSAN, give a different tool the
                   // chance to be loaded
   {
     if (archer_flags->verbose)
       std::cout << "Archer detected OpenMP application without TSan "
                    "stopping operation"
                 << std::endl;
     delete archer_flags;
     archer_flags = nullptr;
     return NULL;
   }

   if (archer_flags->verbose)
     std::cout << "Archer detected OpenMP application with TSan, supplying "
                  "OpenMP synchronization semantics"
               << std::endl;
   return &ompt_start_tool_result;
 }