| //===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // Parallel implementation in the GPU. Here is the pattern: |
| // |
| // while (not finished) { |
| // |
| // if (master) { |
| // sequential code, decide which par loop to do, or if finished |
| // __kmpc_kernel_prepare_parallel() // exec by master only |
| // } |
| // syncthreads // A |
| // __kmpc_kernel_parallel() // exec by all |
| // if (this thread is included in the parallel) { |
| // switch () for all parallel loops |
| // __kmpc_kernel_end_parallel() // exec only by threads in parallel |
| // } |
| // } |
| // |
| // The reason we don't exec end_parallel for the threads not included |
| // in the parallel loop is that for each barrier in the parallel |
| // region, these non-included threads will cycle through the |
| // syncthreads A. Thus they must preserve their current threadId, which |
| // is at least the number of threads in the team. |
| // |
| // To make a long story short... |
| // |
| //===----------------------------------------------------------------------===// |
| #pragma omp declare target |
| |
| #include "common/omptarget.h" |
| #include "target_impl.h" |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // support for parallel that goes parallel (1 static level only) |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause, |
| uint16_t NThreadsICV, |
| uint16_t ThreadLimit) { |
| uint16_t ThreadsRequested = NThreadsICV; |
| if (NumThreadsClause != 0) { |
| ThreadsRequested = NumThreadsClause; |
| } |
| |
| uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam(); |
| if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) { |
| ThreadsAvailable = ThreadLimit; |
| } |
| |
| uint16_t NumThreads = ThreadsAvailable; |
| if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) { |
| NumThreads = ThreadsRequested; |
| } |
| |
| #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 |
| // On Volta and newer architectures we require that all lanes in |
| // a warp participate in the parallel region. Round down to a |
| // multiple of WARPSIZE since it is legal to do so in OpenMP. |
| if (NumThreads < WARPSIZE) { |
| NumThreads = 1; |
| } else { |
| NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1)); |
| } |
| #endif |
| |
| return NumThreads; |
| } |
| |
| // This routine is always called by the team master.. |
| EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) { |
| PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); |
| |
| omptarget_nvptx_workFn = WorkFn; |
| |
| // This routine is only called by the team master. The team master is |
| // the first thread of the last warp. It always has the logical thread |
| // id of 0 (since it is a shadow for the first worker thread). |
| const int threadId = 0; |
| omptarget_nvptx_TaskDescr *currTaskDescr = |
| omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); |
| ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); |
| ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(), |
| "cannot be called in a parallel region."); |
| if (currTaskDescr->InParallelRegion()) { |
| PRINT0(LD_PAR, "already in parallel: go seq\n"); |
| return; |
| } |
| |
| uint16_t &NumThreadsClause = |
| omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); |
| |
| uint16_t NumThreads = |
| determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit); |
| |
| if (NumThreadsClause != 0) { |
| // Reset request to avoid propagating to successive #parallel |
| NumThreadsClause = 0; |
| } |
| |
| ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", |
| (int)NumThreads); |
| ASSERT0(LT_FUSSY, |
| __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(), |
| "only team master can create parallel"); |
| |
| // Set number of threads on work descriptor. |
| omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); |
| workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr); |
| threadsInTeam = NumThreads; |
| } |
| |
| // All workers call this function. Deactivate those not needed. |
| // Fn - the outlined work function to execute. |
| // returns True if this thread is active, else False. |
| // |
| // Only the worker threads call this routine. |
| EXTERN bool __kmpc_kernel_parallel(void **WorkFn) { |
| PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); |
| |
| // Work function and arguments for L1 parallel region. |
| *WorkFn = omptarget_nvptx_workFn; |
| |
| // If this is the termination signal from the master, quit early. |
| if (!*WorkFn) { |
| PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n"); |
| return false; |
| } |
| |
| // Only the worker threads call this routine and the master warp |
| // never arrives here. Therefore, use the nvptx thread id. |
| int threadId = __kmpc_get_hardware_thread_id_in_block(); |
| omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); |
| // Set to true for workers participating in the parallel region. |
| bool isActive = false; |
| // Initialize state for active threads. |
| if (threadId < threadsInTeam) { |
| // init work descriptor from workdesccr |
| omptarget_nvptx_TaskDescr *newTaskDescr = |
| omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); |
| ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); |
| newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr()); |
| // install new top descriptor |
| omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, |
| newTaskDescr); |
| // init private from int value |
| PRINT(LD_PAR, |
| "thread will execute parallel region with id %d in a team of " |
| "%d threads\n", |
| (int)newTaskDescr->ThreadId(), (int)nThreads); |
| |
| isActive = true; |
| } |
| |
| return isActive; |
| } |
| |
| EXTERN void __kmpc_kernel_end_parallel() { |
| // pop stack |
| PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); |
| ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); |
| |
| // Only the worker threads call this routine and the master warp |
| // never arrives here. Therefore, use the nvptx thread id. |
| int threadId = __kmpc_get_hardware_thread_id_in_block(); |
| omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); |
| omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( |
| threadId, currTaskDescr->GetPrevTaskDescr()); |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // support for parallel that goes sequential |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| static void serializedParallel(kmp_Ident *loc, uint32_t global_tid) { |
| PRINT0(LD_IO, "call to serializedParallel\n"); |
| |
| IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); |
| |
| if (isRuntimeUninitialized()) { |
| ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), |
| "Expected SPMD mode with uninitialized runtime."); |
| return; |
| } |
| |
| // assume this is only called for nested parallel |
| int threadId = GetLogicalThreadIdInBlock(); |
| |
| // unlike actual parallel, threads in the same team do not share |
| // the workTaskDescr in this case and num threads is fixed to 1 |
| |
| // get current task |
| omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); |
| currTaskDescr->SaveLoopData(); |
| |
| // allocate new task descriptor and copy value from current one, set prev to |
| // it |
| omptarget_nvptx_TaskDescr *newTaskDescr = |
| (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), |
| "new seq parallel task"); |
| newTaskDescr->CopyParent(currTaskDescr); |
| |
| // tweak values for serialized parallel case: |
| // - each thread becomes ID 0 in its serialized parallel, and |
| // - there is only one thread per team |
| newTaskDescr->ThreadId() = 0; |
| |
| // set new task descriptor as top |
| omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, |
| newTaskDescr); |
| } |
| |
| static void endSerializedParallel(kmp_Ident *loc, |
| uint32_t global_tid) { |
| PRINT0(LD_IO, "call to endSerializedParallel\n"); |
| |
| DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); |
| |
| if (isRuntimeUninitialized()) { |
| ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), |
| "Expected SPMD mode with uninitialized runtime."); |
| return; |
| } |
| |
| // pop stack |
| int threadId = GetLogicalThreadIdInBlock(); |
| omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); |
| // set new top |
| omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( |
| threadId, currTaskDescr->GetPrevTaskDescr()); |
| // free |
| SafeFree(currTaskDescr, "new seq parallel task"); |
| currTaskDescr = getMyTopTaskDescriptor(threadId); |
| currTaskDescr->RestoreLoopData(); |
| } |
| |
| NOINLINE EXTERN uint8_t __kmpc_parallel_level() { |
| return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); |
| } |
| |
| // This kmpc call returns the thread id across all teams. It's value is |
| // cached by the compiler and used when calling the runtime. On nvptx |
| // it's cheap to recalculate this value so we never use the result |
| // of this call. |
| EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { |
| return GetOmpThreadId(); |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // push params |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, |
| int32_t num_threads) { |
| PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); |
| ASSERT0(LT_FUSSY, isRuntimeInitialized(), |
| "Runtime must be initialized."); |
| tid = GetLogicalThreadIdInBlock(); |
| omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = |
| num_threads; |
| } |
| |
| // Do nothing. The host guarantees we started the requested number of |
| // teams and we only need inspection of gridDim. |
| |
| EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, |
| int32_t num_teams, int32_t thread_limit) { |
| PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); |
| ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device"); |
| } |
| |
| EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, int proc_bind) { |
| PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // parallel interface |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| NOINLINE EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid, |
| kmp_int32 if_expr, |
| kmp_int32 num_threads, int proc_bind, |
| void *fn, void *wrapper_fn, void **args, |
| size_t nargs) { |
| // Handle the serialized case first, same for SPMD/non-SPMD except that in |
| // SPMD mode we already incremented the parallel level counter, account for |
| // that. |
| bool InParallelRegion = |
| (__kmpc_parallel_level() > __kmpc_is_spmd_exec_mode()); |
| if (!if_expr || InParallelRegion) { |
| serializedParallel(ident, global_tid); |
| __kmp_invoke_microtask(global_tid, 0, fn, args, nargs); |
| endSerializedParallel(ident, global_tid); |
| return; |
| } |
| |
| if (__kmpc_is_spmd_exec_mode()) { |
| __kmp_invoke_microtask(global_tid, 0, fn, args, nargs); |
| return; |
| } |
| |
| // Handle the num_threads clause. |
| if (num_threads != -1) |
| __kmpc_push_num_threads(ident, global_tid, num_threads); |
| |
| __kmpc_kernel_prepare_parallel((void *)wrapper_fn); |
| |
| if (nargs) { |
| void **GlobalArgs; |
| __kmpc_begin_sharing_variables(&GlobalArgs, nargs); |
| // TODO: faster memcpy? |
| #pragma unroll |
| for (int I = 0; I < nargs; I++) |
| GlobalArgs[I] = args[I]; |
| } |
| |
| // TODO: what if that's a parallel region with a single thread? this is |
| // considered not active in the existing implementation. |
| bool IsActiveParallelRegion = threadsInTeam != 1; |
| int NumWarps = |
| threadsInTeam / WARPSIZE + ((threadsInTeam % WARPSIZE) ? 1 : 0); |
| // Increment parallel level for non-SPMD warps. |
| for (int I = 0; I < NumWarps; ++I) |
| parallelLevel[I] += |
| (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); |
| |
| // Master signals work to activate workers. |
| __kmpc_barrier_simple_spmd(ident, 0); |
| |
| // OpenMP [2.5, Parallel Construct, p.49] |
| // There is an implied barrier at the end of a parallel region. After the |
| // end of a parallel region, only the master thread of the team resumes |
| // execution of the enclosing task region. |
| // |
| // The master waits at this barrier until all workers are done. |
| __kmpc_barrier_simple_spmd(ident, 0); |
| |
| // Decrement parallel level for non-SPMD warps. |
| for (int I = 0; I < NumWarps; ++I) |
| parallelLevel[I] -= |
| (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); |
| // TODO: Is synchronization needed since out of parallel execution? |
| |
| if (nargs) |
| __kmpc_end_sharing_variables(); |
| |
| // TODO: proc_bind is a noop? |
| // if (proc_bind != proc_bind_default) |
| // __kmpc_push_proc_bind(ident, global_tid, proc_bind); |
| } |
| |
| #pragma omp end declare target |