[OPENMP]Fix threadid in __kmpc_omp_taskwait call for dependent target calls.

Summary:
We used to call the __kmpc_omp_taskwait function with the global thread ID
hard-coded to 0. This may crash the application at runtime if the thread
executing the target region is not the master thread.

Reviewers: grokos, kkwli0

Subscribers: guansong, jdoerfert, caomhin, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D64571

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@366220 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/src/interface.cpp b/libomptarget/src/interface.cpp
index 52850ee..32afe3f 100644
--- a/libomptarget/src/interface.cpp
+++ b/libomptarget/src/interface.cpp
@@ -128,7 +128,7 @@
     int32_t depNum, void *depList, int32_t noAliasDepNum,
     void *noAliasDepList) {
   if (depNum + noAliasDepNum > 0)
-    __kmpc_omp_taskwait(NULL, 0);
+    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
 
   __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes,
                           arg_types);
@@ -181,7 +181,7 @@
     int32_t depNum, void *depList, int32_t noAliasDepNum,
     void *noAliasDepList) {
   if (depNum + noAliasDepNum > 0)
-    __kmpc_omp_taskwait(NULL, 0);
+    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
 
   __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes,
                         arg_types);
@@ -214,7 +214,7 @@
     int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList,
     int32_t noAliasDepNum, void *noAliasDepList) {
   if (depNum + noAliasDepNum > 0)
-    __kmpc_omp_taskwait(NULL, 0);
+    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
 
   __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes,
                            arg_types);
@@ -255,7 +255,7 @@
     int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum,
     void *noAliasDepList) {
   if (depNum + noAliasDepNum > 0)
-    __kmpc_omp_taskwait(NULL, 0);
+    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
 
   return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
                       arg_types);
@@ -298,7 +298,7 @@
     int64_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum,
     void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
   if (depNum + noAliasDepNum > 0)
-    __kmpc_omp_taskwait(NULL, 0);
+    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
 
   return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args,
                             arg_sizes, arg_types, team_num, thread_limit);
diff --git a/libomptarget/test/offloading/target_depend_nowait.cpp b/libomptarget/test/offloading/target_depend_nowait.cpp
new file mode 100644
index 0000000..2c1c7e7
--- /dev/null
+++ b/libomptarget/test/offloading/target_depend_nowait.cpp
@@ -0,0 +1,62 @@
+// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+
+#include <omp.h>
+#include <stdio.h>
+
+#define N 1024
+
+int A[N];
+int B[N];
+int C[N];
+int main() {
+  for (int i = 0; i < N; i++)
+    A[i] = B[i] = i;
+
+#pragma omp parallel num_threads(2)
+  {
+    if (omp_get_thread_num() == 1) {
+// map data A & B and move them to the device
+#pragma omp target enter data map(to : A, B) depend(out : A[0]) nowait
+
+// no data move since already mapped
+#pragma omp target map(A, B) depend(out : A[0]) nowait
+      {
+        for (int i = 0; i < N; i++)
+          ++A[i];
+        for (int i = 0; i < N; i++)
+          ++B[i];
+      }
+
+// no data move since already mapped
+#pragma omp target teams num_teams(1) map(A, B) depend(out : A[0]) nowait
+      {
+        for (int i = 0; i < N; i++)
+          ++A[i];
+        for (int i = 0; i < N; i++)
+          ++B[i];
+      }
+
+// A updated via update
+#pragma omp target update from(A) depend(out : A[0]) nowait
+
+// B updated via exit, A just released
+#pragma omp target exit data map(release                                       \
+                                 : A) map(from                                 \
+                                          : B) depend(out                      \
+                                                      : A[0]) nowait
+    } // if
+  }   // parallel
+
+  int Sum = 0;
+  for (int i = 0; i < N; i++)
+    Sum += A[i] + B[i];
+  // Sum is 2 * N * (2 + N - 1 + 2) / 2
+  // CHECK: Sum = 1051648.
+  printf("Sum = %d.\n", Sum);
+
+  return Sum != 2 * N * (2 + N - 1 + 2) / 2;
+}
+