//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Template class that encapsulates all the helper functions.
//
// T is the loop iteration type (32- or 64-bit, unsigned or signed)
// ST is the signed version of T
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
public:
////////////////////////////////////////////////////////////////////////////////
// Loop with static scheduling with chunk
// Generic implementation of OMP loop scheduling with static policy
/*! \brief Calculate the initial bounds and stride for a static loop
 * @param[in] loc location in code of the call (not used here)
 * @param[in] global_tid global thread id
 * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
 * @param[in] plastiter pointer to the last-iteration flag
 * @param[in,out] plower pointer to the loop lower bound; on return it
 * contains the lower bound of the first chunk
 * @param[in,out] pupper pointer to the loop upper bound; on return it
 * contains the upper bound of the first chunk
 * @param[in,out] pstride pointer to the loop stride; on return it contains
 * the stride between two successive chunks executed by the same thread
 * @param[in] incr loop increment bump
 * @param[in] chunk chunk size
 */
// helper function for static chunk
INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
ST chunk, T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size except
    // possibly the last one.
    // Distance between two successive chunks:
stride = numberOfEntities * chunk;
lb = lb + entityId * chunk;
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the stride equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
}
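  // For illustration: with lb = 0, ub = 99, chunk = 10 and 4 threads,
  // thread 1 gets lb = 10, ub = 19 and stride = 40, i.e. it executes
  // chunks [10,19], [50,59] and [90,99]; the last chunk begins at 90 and
  // (90 - 10) % 40 == 0, so thread 1 is flagged as running the last
  // iteration.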
////////////////////////////////////////////////////////////////////////////////
// Loop with static scheduling without chunk
// helper function for static no chunk
INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
ST &chunk, T entityId,
T numberOfEntities) {
    // No chunk size was specified. Each thread or warp gets at most one
    // chunk; chunk sizes differ by at most one iteration.
T loopSize = ub - lb + 1;
chunk = loopSize / numberOfEntities;
T leftOver = loopSize - chunk * numberOfEntities;
if (entityId < leftOver) {
chunk++;
lb = lb + entityId * chunk;
} else {
lb = lb + entityId * chunk + leftOver;
}
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per entity
}
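  // For illustration: with lb = 0, ub = 101 and 4 entities, loopSize = 102,
  // chunk = 25 and leftOver = 2, so entities 0 and 1 get 26 iterations
  // ([0,25] and [26,51]) while entities 2 and 3 get 25 ([52,76] and
  // [77,101]); entity 3 covers the original upper bound and is flagged last.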
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
INLINE static void for_static_init(int32_t schedtype, int32_t *plastiter,
T *plower, T *pupper, ST *pstride,
ST chunk, bool IsSPMDExecutionMode,
bool IsRuntimeUninitialized) {
// When IsRuntimeUninitialized is true, we assume that the caller is
// in an L0 parallel region and that all worker threads participate.
int tid = GetLogicalThreadIdInBlock();
    // Assume we are in a teams region or that we use a single block
    // per target region.
ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(
tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
    // All warps in excess of the maximum requested do not execute the loop.
PRINT(LD_LOOP,
"OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
"%d, num tids %d\n",
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
schedtype, P64(chunk),
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
IsRuntimeUninitialized));
ASSERT0(
LT_FUSSY,
(GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized)) <
(GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
IsRuntimeUninitialized)),
"current thread is not needed here; error");
// copy
int lastiter = 0;
T lb = *plower;
T ub = *pupper;
ST stride = *pstride;
T entityId, numberOfEntities;
// init
switch (schedtype) {
case kmp_sched_static_chunk: {
if (chunk > 0) {
entityId =
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
IsRuntimeUninitialized);
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
numberOfEntities);
break;
}
    } // note: if chunk <= 0, fall through to the nochunk case
case kmp_sched_static_nochunk: {
entityId =
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
IsRuntimeUninitialized);
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
numberOfEntities);
break;
}
case kmp_sched_distr_static_chunk: {
if (chunk > 0) {
entityId = GetOmpTeamId();
numberOfEntities = GetNumberOfOmpTeams();
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
numberOfEntities);
break;
      } // note: if chunk <= 0, fall through to the nochunk case
}
case kmp_sched_distr_static_nochunk: {
entityId = GetOmpTeamId();
numberOfEntities = GetNumberOfOmpTeams();
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
numberOfEntities);
break;
}
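    // Combined 'distribute parallel for' with one chunk per entity: the
    // entity is the OpenMP thread linearized across all teams, i.e.
    // threadsPerTeam * teamId + threadIdInTeam.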
case kmp_sched_distr_static_chunk_sched_static_chunkone: {
entityId =
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
IsRuntimeUninitialized) *
GetOmpTeamId() +
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
numberOfEntities = GetNumberOfOmpTeams() *
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
IsRuntimeUninitialized);
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
numberOfEntities);
break;
}
default: {
ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype);
PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
schedtype);
entityId =
GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
IsRuntimeUninitialized);
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
numberOfEntities);
}
}
// copy back
*plastiter = lastiter;
*plower = lb;
*pupper = ub;
*pstride = stride;
PRINT(LD_LOOP,
"Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n",
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
IsRuntimeUninitialized),
GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
P64(*pstride));
}
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch Init
INLINE static int OrderedSchedule(kmp_sched_t schedule) {
return schedule >= kmp_sched_ordered_first &&
schedule <= kmp_sched_ordered_last;
}
INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId,
kmp_sched_t schedule, T lb, T ub, ST st,
ST chunk) {
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = currTaskDescr->ThreadsInTeam();
T tripCount = ub - lb + 1; // +1 because ub is inclusive
ASSERT0(
LT_FUSSY,
GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) <
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
"current thread is not needed here; error");
    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it does, we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on; they merely affect which schedule we can legally choose for
     * various dynamic cases (in particular, whether or not a stealing scheme
     * is legal).
     */
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
// Process schedule.
if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
if (OrderedSchedule(schedule))
__kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
(long)tnum, P64(tripCount), schedule);
schedule = kmp_sched_static_chunk;
chunk = tripCount; // one thread gets the whole loop
} else if (schedule == kmp_sched_runtime) {
// process runtime
omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
chunk = currTaskDescr->RuntimeChunkSize();
switch (rtSched) {
case omp_sched_static: {
if (chunk > 0)
schedule = kmp_sched_static_chunk;
else
schedule = kmp_sched_static_nochunk;
break;
}
case omp_sched_auto: {
schedule = kmp_sched_static_chunk;
chunk = 1;
break;
}
case omp_sched_dynamic:
case omp_sched_guided: {
schedule = kmp_sched_dynamic;
break;
}
}
PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule,
P64(chunk));
} else if (schedule == kmp_sched_auto) {
schedule = kmp_sched_static_chunk;
chunk = 1;
PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule,
P64(chunk));
} else {
PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk));
ASSERT(LT_FUSSY,
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
"unknown schedule %d & chunk %lld\n", schedule, P64(chunk));
}
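    // For illustration (assuming the host runtime propagated a setting such
    // as OMP_SCHEDULE="static,4" to this task): GetRuntimeSched() yields
    // omp_sched_static and RuntimeChunkSize() yields 4, so schedule(runtime)
    // resolves above to kmp_sched_static_chunk with chunk = 4.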
// init schedules
if (schedule == kmp_sched_static_chunk) {
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
// save sched state
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
// save ub
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
// compute static chunk
ST stride;
int lastiter = 0;
ForStaticChunk(
lastiter, lb, ub, stride, chunk,
GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
PRINT(LD_LOOP,
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->Stride(tid));
} else if (schedule == kmp_sched_static_nochunk) {
ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
// save sched state
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
// save ub
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
// compute static chunk
ST stride;
int lastiter = 0;
ForStaticNoChunk(
lastiter, lb, ub, stride, chunk,
GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
// save computed params
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
PRINT(LD_LOOP,
"dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
", next lower bound = %llu, stride = %llu\n",
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->Stride(tid));
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
__kmpc_barrier(loc, threadId);
// save sched state
int teamId = GetOmpTeamId();
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
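    // Dynamic/guided dispatch state is shared by the whole team, so it is
    // indexed by teamId rather than tid and written by a single thread.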
if (GetThreadIdInBlock() == 0) {
if (chunk < 1)
chunk = 1;
omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
}
__kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
", chunk %" PRIu64 "\n",
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
omptarget_nvptx_threadPrivateContext->Chunk(teamId));
}
}
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch next
INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
int64_t &loopLowerBound,
T loopUpperBound) {
// calculate lower bound for all lanes in the warp
lb = atomicAdd((unsigned long long *)&loopLowerBound,
(unsigned long long)chunkSize);
ub = lb + chunkSize - 1; // Clang uses i <= ub
// 3 result cases:
// a. lb and ub < loopUpperBound --> NOT_FINISHED
// b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
// NOT_FINISHED
// c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
// a.
if (lb <= loopUpperBound && ub < loopUpperBound) {
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb),
P64(ub), P64(loopUpperBound));
return NOT_FINISHED;
}
// b.
if (lb <= loopUpperBound) {
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
P64(lb), P64(ub), P64(loopUpperBound));
ub = loopUpperBound;
return LAST_CHUNK;
}
    // c. Empty chunk: set lb > ub so the bounds clearly mark a finished loop.
lb = loopUpperBound + 2;
ub = loopUpperBound + 1;
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb),
P64(ub), P64(loopUpperBound));
return FINISHED;
}
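  // For illustration: with chunkSize = 4 and loopUpperBound = 9, successive
  // atomic claims yield [0,3] and [4,7] (NOT_FINISHED), then [8,11], which
  // is clipped to [8,9] (LAST_CHUNK); any later claim starts beyond 9 and
  // returns FINISHED.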
// On Pascal, with inlining of the runtime into the user application,
// this code deadlocks. This is probably because different threads
// in a warp cannot make independent progress.
NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper,
ST *pstride) {
    // Logical thread id in the block; automatically selects the thread or
    // warp id based on the selected implementation.
int tid = GetLogicalThreadIdInBlock();
ASSERT0(
LT_FUSSY,
GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) <
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
"current thread is not needed here; error");
// retrieve schedule
kmp_sched_t schedule =
omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
    // TODO: reduce the two static cases to one.
if (schedule == kmp_sched_static_chunk ||
schedule == kmp_sched_static_nochunk) {
T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
// finished?
if (myLb > ub) {
PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
P64(myLb), P64(ub));
return DISPATCH_FINISHED;
}
// not finished, save current bounds
ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
*plower = myLb;
T myUb = myLb + chunk - 1; // Clang uses i <= ub
if (myUb > ub)
myUb = ub;
*pupper = myUb;
*plast = (int32_t)(myUb == ub);
// increment next lower bound by the stride
ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
P64(*plower), P64(*pupper));
return DISPATCH_NOTFINISHED;
}
ASSERT0(LT_FUSSY,
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
"bad sched");
T myLb, myUb;
int teamId = GetOmpTeamId();
int finished = DynamicNextChunk(
myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
if (finished == FINISHED)
return DISPATCH_FINISHED;
    // Not finished: either a full chunk or the last (clipped) chunk.
*plast = (int32_t)(finished == LAST_CHUNK);
*plower = myLb;
*pupper = myUb;
*pstride = 1;
PRINT(LD_LOOP,
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n",
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
P64(*pstride));
return DISPATCH_NOTFINISHED;
}
INLINE static void dispatch_fini() {
// nothing
}
////////////////////////////////////////////////////////////////////////////////
// end of template class that encapsulate all the helper functions
////////////////////////////////////////////////////////////////////////////////
};
////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////
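// For illustration, a dynamically scheduled loop is driven roughly as
// follows (an assumed codegen shape, not verbatim compiler output):
//
//   __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, 0, N - 1, 1, chunk);
//   int32_t last, lb, ub, st;
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
//     for (int32_t i = lb; i <= ub; i += st)
//       body(i);
//   __kmpc_dispatch_fini_4(loc, tid);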
// init
EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid,
int32_t schedule, int32_t lb, int32_t ub,
int32_t st, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
int32_t schedule, uint32_t lb, uint32_t ub,
int32_t st, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
int32_t schedule, int64_t lb, int64_t ub,
int64_t st, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
int32_t schedule, uint64_t lb, uint64_t ub,
int64_t st, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}
// next
EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid, int32_t *p_last,
int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t tid,
int32_t *p_last, uint32_t *p_lb,
uint32_t *p_ub, int32_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t tid, int32_t *p_last,
int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t tid,
int32_t *p_last, uint64_t *p_lb,
uint64_t *p_ub, int64_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
p_last, p_lb, p_ub, p_st);
}
// fini
EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}
EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}
EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}
EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}
////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////
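// For illustration, a statically scheduled chunked loop uses this interface
// roughly as follows (an assumed codegen shape, not verbatim compiler
// output):
//
//   int32_t last = 0, lb = 0, ub = N - 1, st = 1;
//   __kmpc_for_static_init_4(loc, gtid, kmp_sched_static_chunk, &last, &lb,
//                            &ub, &st, 1, chunk);
//   for (; lb <= N - 1; lb += st, ub += st)
//     for (int32_t i = lb; i <= (ub < N - 1 ? ub : N - 1); ++i)
//       body(i);
//   __kmpc_for_static_fini(loc, gtid);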
EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
isRuntimeUninitialized());
}
EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
isRuntimeUninitialized());
}
EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
isRuntimeUninitialized());
}
EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
uint64_t *plower, uint64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
isRuntimeUninitialized());
}
EXTERN
void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int32_t *plower, int32_t *pupper,
int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true,
/*IsRuntimeUninitialized=*/true);
}
EXTERN
void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter, uint32_t *plower,
uint32_t *pupper, int32_t *pstride,
int32_t incr, int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true,
/*IsRuntimeUninitialized=*/true);
}
EXTERN
void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid,
int32_t schedtype, int32_t *plastiter,
int64_t *plower, int64_t *pupper,
int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true,
/*IsRuntimeUninitialized=*/true);
}
EXTERN
void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
int32_t schedtype,
int32_t *plastiter, uint64_t *plower,
uint64_t *pupper, int64_t *pstride,
int64_t incr, int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true,
/*IsRuntimeUninitialized=*/true);
}
EXTERN
void __kmpc_for_static_init_4_simple_generic(
kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false,
/*IsRuntimeUninitialized=*/true);
}
EXTERN
void __kmpc_for_static_init_4u_simple_generic(
kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
int32_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false,
/*IsRuntimeUninitialized=*/true);
}
EXTERN
void __kmpc_for_static_init_8_simple_generic(
kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false,
/*IsRuntimeUninitialized=*/true);
}
EXTERN
void __kmpc_for_static_init_8u_simple_generic(
kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
int64_t chunk) {
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false,
/*IsRuntimeUninitialized=*/true);
}
EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}
namespace {
INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE);
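  // e.g., NumThreads = 33 rounds up to NumWarps = 2, so 64 lanes join the
  // named barrier below.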
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures, all lanes in a warp (at least, all
  // lanes present for the kernel launch) must participate in the barrier.
  // This is enforced when launching the parallel region. An exception is
  // when there are fewer than WARPSIZE workers; in that case only one
  // worker is started, so no barrier is needed.
if (NumThreads > 1) {
#endif
named_sync(L1_BARRIER, WARPSIZE * NumWarps);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
}
#endif
}
} // namespace
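// Resolves conditional lastprivates: each thread contributes, per variable,
// the index of the last iteration in which it assigned that variable; an
// atomic max across the team then identifies the sequentially last writer,
// and every thread reads the winning index back into its private array.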
EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, int32_t gtid,
int32_t varNum, void *array) {
PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), isSPMDMode(),
isRuntimeUninitialized());
uint32_t NumThreads = GetNumberOfOmpThreads(
GetLogicalThreadIdInBlock(), isSPMDMode(), isRuntimeUninitialized());
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
  for (int32_t i = 0; i < varNum; i++) {
// Reset buffer.
if (tid == 0)
*Buffer = 0; // Reset to minimum loop iteration value.
// Barrier.
syncWorkersInGenericMode(NumThreads);
// Atomic max of iterations.
uint64_t *varArray = (uint64_t *)array;
uint64_t elem = varArray[i];
(void)atomicMax((unsigned long long int *)Buffer,
(unsigned long long int)elem);
// Barrier.
syncWorkersInGenericMode(NumThreads);
// Read max value and update thread private array.
varArray[i] = *Buffer;
// Barrier.
syncWorkersInGenericMode(NumThreads);
}
}