[CUDA] Fix dynamic|guided scheduling. The existing implementation of dynamic scheduling breaks the contract introduced by the original OpenMP runtime and is therefore incorrect. This patch fixes it and introduces a correct dynamic scheduling model. Thanks to Alexey Bataev for submitting this patch. Differential Revision: https://reviews.llvm.org/D47333 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@333225 91177308-0d34-0410-b5e6-96231b3b80d8

commit: 307b6fcfcd1dd8983e77d8fc83f913ddc55b7a5f [log] [tgz]
author: George Rokos <grokos@us.ibm.com> Thu May 24 21:12:41 2018 +0000
committer: George Rokos <grokos@us.ibm.com> Thu May 24 21:12:41 2018 +0000
tree: 5ae74aed6f7a59d2e9c5efb491580e2b48878ac5
parent: abedd0661b48ab4541cd7fbbf68de130798c4b16 [diff]
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index e764752..642516d 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu

@@ -215,7 +215,8 @@
            schedule <= kmp_sched_ordered_last;
   }
 
-  INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, ST st,
+  INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId,
+                                   kmp_sched_t schedule, T lb, T ub, ST st,
                                    ST chunk) {
     int tid = GetLogicalThreadIdInBlock();
     omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
@@ -282,18 +283,15 @@
              "unknown schedule %d & chunk %lld\n", schedule, P64(chunk));
     }
 
-    // save sched state
-    omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
-    omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
-
     // init schedules
     if (schedule == kmp_sched_static_chunk) {
       ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
       // save ub
       omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
       // compute static chunk
       ST stride;
-      T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
       int lastiter = 0;
       ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
       // save computed params
@@ -301,8 +299,8 @@
       omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
       omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
       PRINT(LD_LOOP,
-            "dispatch init (static chunk) : num threads = %d, ub =  %" PRId64 ","
-            "next lower bound = %llu, stride = %llu\n",
+            "dispatch init (static chunk) : num threads = %d, ub =  %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
             omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
             omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
@@ -310,11 +308,12 @@
 
     } else if (schedule == kmp_sched_static_nochunk) {
       ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
       // save ub
       omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
       // compute static chunk
       ST stride;
-      T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
       int lastiter = 0;
       ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
       // save computed params
@@ -322,45 +321,50 @@
       omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
       omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
       PRINT(LD_LOOP,
-            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 ","
-            "next lower bound = %llu, stride = %llu\n",
+            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
             omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
             omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
             omptarget_nvptx_threadPrivateContext->Stride(tid));
 
     } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      if (chunk < 1)
-        chunk = 1;
-      Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks
-      // but each thread (but one) must discover that it is last
-      eventNum += tnum;
-      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
-      omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum;
+      if (isSPMDMode())
+        __syncthreads();
+      else
+        __kmpc_barrier(loc, threadId);
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      if (GetThreadIdInBlock() == 0) {
+        if (chunk < 1)
+          chunk = 1;
+        int teamId = GetOmpTeamId();
+        omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+      }
+      if (isSPMDMode())
+        __syncthreads();
+      else
+        __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
-            "dispatch init (dyn) : num threads = %d, ub = %" PRId64 ", chunk %" PRIu64 ", "
-            "events number = %llu\n",
+            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+            ", chunk %" PRIu64 "\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
-            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
-            omptarget_nvptx_threadPrivateContext->Chunk(tid),
-            omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
+            omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
+            omptarget_nvptx_threadPrivateContext->Chunk(teamId));
     }
   }
 
   ////////////////////////////////////////////////////////////////////////////////
   // Support for dispatch next
 
-  INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg,
-                                     Counter priv, T &lb, T &ub,
-                                     Counter &chunkId, Counter &currentEvent,
-                                     T chunkSize, T loopUpperBound) {
-    // get next event atomically
-    Counter nextEvent = cg.Next();
-    // calculate chunk Id (priv was initialized upon entering the loop to
-    // 'start' == 'event')
-    chunkId = nextEvent - priv;
+  INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
+                                     Counter &loopLowerBound,
+                                     T loopUpperBound) {
     // calculate lower bound for all lanes in the warp
-    lb = chunkId * chunkSize; // this code assume normalization of LB
+    lb = atomicAdd(&loopLowerBound, (Counter)chunkSize);
     ub = lb + chunkSize - 1;  // Clang uses i <= ub
 
     // 3 result cases:
@@ -368,9 +372,8 @@
     //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
     //  NOT_FINISHED
     //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
-    currentEvent = nextEvent;
     // a.
-    if (ub <= loopUpperBound) {
+    if (lb <= loopUpperBound && ub < loopUpperBound) {
       PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb),
             P64(ub), P64(loopUpperBound));
       return NOT_FINISHED;
@@ -383,7 +386,8 @@
       return LAST_CHUNK;
     }
     // c. if we are here, we are in case 'c'
-    lb = loopUpperBound + 1;
+    lb = loopUpperBound + 2;
+    ub = loopUpperBound + 1;
     PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb),
           P64(ub), P64(loopUpperBound));
     return FINISHED;
@@ -437,29 +441,18 @@
     ASSERT0(LT_FUSSY,
             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
             "bad sched");
-    omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
     T myLb, myUb;
-    Counter chunkId;
-    // xxx current event is now local
-    omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup();
+    int teamId = GetOmpTeamId();
     int finished = DynamicNextChunk(
-        cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb,
-        chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid),
-        omptarget_nvptx_threadPrivateContext->Chunk(tid),
-        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
+        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
 
-    if (finished == FINISHED) {
-      cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid),
-                  omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
-      cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid),
-                 omptarget_nvptx_threadPrivateContext->CurrentEvent(tid));
-
+    if (finished == FINISHED)
       return DISPATCH_FINISHED;
-    }
 
     // not finished (either not finished or last chunk)
-    *plast = (int32_t)(
-        myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
+    *plast = (int32_t)(finished == LAST_CHUNK);
     *plower = myLb;
     *pupper = myUb;
     *pstride = 1;
@@ -491,7 +484,7 @@
                                    int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
@@ -499,7 +492,7 @@
                                     int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
@@ -507,7 +500,7 @@
                                    int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
@@ -515,7 +508,7 @@
                                     int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 // next
commit	307b6fcfcd1dd8983e77d8fc83f913ddc55b7a5f	[log] [tgz]
author	George Rokos <grokos@us.ibm.com>	Thu May 24 21:12:41 2018 +0000
committer	George Rokos <grokos@us.ibm.com>	Thu May 24 21:12:41 2018 +0000
tree	5ae74aed6f7a59d2e9c5efb491580e2b48878ac5
parent	abedd0661b48ab4541cd7fbbf68de130798c4b16 [diff]