[OPENMP][NVPTX]Fix dynamic scheduling.

Summary:
Previous implementation may cause the runtime crash when the number of
teams is > 1024. Patch fixes this problem + reduces number of the atomic
operations by 32 times.

Reviewers: grokos, gtbercea, kkwli0

Subscribers: guansong, jfb, openmp-commits, caomhin

Differential Revision: https://reviews.llvm.org/D56332

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@350524 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index b8a61a4..998ce54 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -352,18 +352,18 @@
                 omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
             (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                 tid));
-
     } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      __kmpc_barrier(loc, threadId);
-      // save sched state
-      int teamId = GetOmpTeamId();
+      // save data
       omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
-      if (GetThreadIdInBlock() == 0) {
-        if (chunk < 1)
-          chunk = 1;
-        omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
-        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
-        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+      if (chunk < 1)
+        chunk = 1;
+      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+      __kmpc_barrier(loc, threadId);
+      if (tid == 0) {
+        omptarget_nvptx_threadPrivateContext->Cnt() = 0;
+        __threadfence_block();
       }
       __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
@@ -371,21 +371,45 @@
             ", chunk %" PRIu64 "\n",
             (int)tnum,
             (unsigned long long)
-                omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
-            omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
-            omptarget_nvptx_threadPrivateContext->Chunk(teamId));
+                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+            omptarget_nvptx_threadPrivateContext->Chunk(tid));
     }
   }
 
   ////////////////////////////////////////////////////////////////////////////////
   // Support for dispatch next
 
+  INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
+    int lo, hi;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+    hi = __SHFL_SYNC(active, hi, leader);
+    lo = __SHFL_SYNC(active, lo, leader);
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+    return val;
+  }
+
+  INLINE static uint64_t NextIter() {
+    unsigned int active = __ACTIVEMASK();
+    int leader = __ffs(active) - 1;
+    int change = __popc(active);
+    unsigned lane_mask_lt;
+    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
+    unsigned int rank = __popc(active & lane_mask_lt);
+    uint64_t warp_res;
+    if (rank == 0) {
+      warp_res = atomicAdd(
+          (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
+          change);
+    }
+    warp_res = Shuffle(active, warp_res, leader);
+    return warp_res + rank;
+  }
+
   INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
-                                     int64_t &loopLowerBound,
-                                     T loopUpperBound) {
-    // calculate lower bound for all lanes in the warp
-    lb = atomicAdd((unsigned long long *)&loopLowerBound,
-                   (unsigned long long)chunkSize);
+                                     T loopLowerBound, T loopUpperBound) {
+    T N = NextIter();
+    lb = loopLowerBound + N * chunkSize;
     ub = lb + chunkSize - 1;  // Clang uses i <= ub
 
     // 3 result cases:
@@ -461,11 +485,10 @@
             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
             "bad sched");
     T myLb, myUb;
-    int teamId = GetOmpTeamId();
     int finished = DynamicNextChunk(
-        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
-        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
-        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
+        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
 
     if (finished == FINISHED)
       return DISPATCH_FINISHED;
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 7a05d93..cb6c0b7 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -344,6 +344,7 @@
   INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
 
   INLINE void InitThreadPrivateContext(int tid);
+  INLINE uint64_t &Cnt() { return cnt; }
 
 private:
   // team context for this team
@@ -366,6 +367,7 @@
   // state for dispatch with dyn/guided OR static (never use both at a time)
   int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
   int64_t stride[MAX_THREADS_PER_TEAM];
+  uint64_t cnt;
 };
 
 /// Device envrionment data