//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation in the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//    } // while
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthreads at A. Thus they must preserve their current threadId,
//    which is larger than the number of threads in the team.
//
//    To make a long story short...
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
typedef struct ConvergentSimdJob {
omptarget_nvptx_TaskDescr taskDescr;
omptarget_nvptx_TaskDescr *convHeadTaskDescr;
uint16_t slimForNextSimd;
} ConvergentSimdJob;
////////////////////////////////////////////////////////////////////////////////
// support for convergent simd (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
bool *IsFinal, int32_t *LaneSource,
int32_t *LaneId, int32_t *NumLanes) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
uint32_t ConvergentMask = Mask;
int32_t ConvergentSize = __popc(ConvergentMask);
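  // Pick the next lane that still has work: drop the lanes at or below the
  // previous source, then take the lowest set bit that remains. The region is
  // final when only one such lane is left.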
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __ffs(WorkRemaining);
*IsFinal = __popc(WorkRemaining) == 1;
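  // %lanemask_lt has one bit set for each lane with a lower id than ours;
  // counting those bits within the convergent mask yields this thread's lane
  // id in the simd region.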
uint32_t lanemask_lt;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
*LaneId = __popc(ConvergentMask & lanemask_lt);
int threadId = GetLogicalThreadIdInBlock();
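  // The source thread is lane *LaneSource within this thread's warp.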
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
int32_t SimdLimit =
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
job->slimForNextSimd = SimdLimit;
int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
// reset simdlimit to avoid propagating to successive #simd
if (SimdLimitSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
// We cannot have more than the # of convergent threads.
if (SimdLimitSource > 0)
*NumLanes = min(ConvergentSize, SimdLimitSource);
else
*NumLanes = ConvergentSize;
ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
*NumLanes);
// Set to true for lanes participating in the simd region.
bool isActive = false;
// Initialize state for active threads.
if (*LaneId < *NumLanes) {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
omptarget_nvptx_TaskDescr *sourceTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
sourceThreadId);
job->convHeadTaskDescr = currTaskDescr;
// install top descriptor from the thread for which the lanes are working.
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
sourceTaskDescr);
isActive = true;
}
// requires a memory fence between threads of a warp
return isActive;
}
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_simd\n");
// pop stack
int threadId = GetLogicalThreadIdInBlock();
ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
job->slimForNextSimd;
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, job->convHeadTaskDescr);
}
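// State saved across a convergent parallel region: storage for the new task
// descriptor, the descriptor that was on top of this thread's stack when the
// region started, and the num_threads value that was pending for it; the
// latter two are restored in __kmpc_kernel_end_convergent_parallel().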
typedef struct ConvergentParallelJob {
omptarget_nvptx_TaskDescr taskDescr;
omptarget_nvptx_TaskDescr *convHeadTaskDescr;
uint16_t tnumForNextPar;
} ConvergentParallelJob;
////////////////////////////////////////////////////////////////////////////////
// support for convergent parallelism (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
bool *IsFinal,
int32_t *LaneSource) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
uint32_t ConvergentMask = Mask;
int32_t ConvergentSize = __popc(ConvergentMask);
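  // Same lane-selection scheme as in __kmpc_kernel_convergent_simd above:
  // pick the next source lane that still has work and report whether it is
  // the last one.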
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __ffs(WorkRemaining);
*IsFinal = __popc(WorkRemaining) == 1;
uint32_t lanemask_lt;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
int threadId = GetLogicalThreadIdInBlock();
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
int32_t NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
job->tnumForNextPar = NumThreadsClause;
int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
// reset numthreads to avoid propagating to successive #parallel
if (NumThreadsSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
0;
// We cannot have more than the # of convergent threads.
uint16_t NumThreads;
if (NumThreadsSource > 0)
NumThreads = min(ConvergentSize, NumThreadsSource);
else
NumThreads = ConvergentSize;
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
NumThreads);
// Set to true for workers participating in the parallel region.
bool isActive = false;
// Initialize state for active threads.
if (OmpId < NumThreads) {
// init L2 task descriptor and storage for the L1 parallel task descriptor.
omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
omptarget_nvptx_TaskDescr *sourceTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
sourceThreadId);
job->convHeadTaskDescr = currTaskDescr;
newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
isActive = true;
}
// requires a memory fence between threads of a warp
return isActive;
}
EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
// pop stack
int threadId = GetLogicalThreadIdInBlock();
ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, job->convHeadTaskDescr);
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
job->tnumForNextPar;
}
////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////
// Return the number of cuda threads that participate in the parallel region.
// The calculation has to account for the simd implementation in nvptx,
// i.e. (num omp threads * num lanes):
//
//   cudathreads =
//     if (num_threads != 0) {
//       if (thread_limit > 0) {
//         min(num_threads * numLanes, thread_limit * numLanes);
//       } else {
//         min(num_threads * numLanes, blockDim.x);
//       }
//     } else {
//       if (thread_limit != 0) {
//         min(thread_limit * numLanes, blockDim.x);
//       } else { // no thread_limit, no num_threads, use all cuda threads
//         blockDim.x;
//       }
//     }
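//
// For example (illustrative): with num_threads(48), no thread_limit, one lane
// per thread, and blockDim.x == 128, the parallel region gets
// min(48 * 1, 128) == 48 cuda threads; with neither clause it simply uses all
// available cuda threads (blockDim.x in the formula above).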
//
// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
int16_t IsOMPRuntimeInitialized) {
PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
omptarget_nvptx_workFn = WorkFn;
if (!IsOMPRuntimeInitialized)
return;
// This routine is only called by the team master. The team master is
// the first thread of the last warp. It always has the logical thread
// id of 0 (since it is a shadow for the first worker thread).
int threadId = 0;
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
"cannot be called in a parallel region.");
if (currTaskDescr->InParallelRegion()) {
PRINT0(LD_PAR, "already in parallel: go seq\n");
return;
}
uint16_t CudaThreadsForParallel = 0;
uint16_t NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
// we cannot have more than block size
uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();
// currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
// specified by the thread_limit clause on the target directive.
// GetNumberOfWorkersInTeam(): This is the number of workers available
// in this kernel instance.
//
// E.g: If thread_limit is 33, the kernel is launched with 33+32=65
// threads. The last warp is the master warp so in this case
// GetNumberOfWorkersInTeam() returns 64.
  // This is different from the OpenMP ThreadAvail because we may be
  // using some of the CUDA threads as SIMD lanes.
int NumLanes = 1;
if (NumThreadsClause != 0) {
// reset request to avoid propagating to successive #parallel
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
0;
// assume that thread_limit*numlanes is already <= CudaThreadsAvail
// because that is already checked on the host side (CUDA offloading rtl)
if (currTaskDescr->ThreadLimit() != 0)
CudaThreadsForParallel =
NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
? NumThreadsClause * NumLanes
: currTaskDescr->ThreadLimit() * NumLanes;
else {
CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
? CudaThreadsAvail
: NumThreadsClause * NumLanes;
}
} else {
if (currTaskDescr->ThreadLimit() != 0) {
CudaThreadsForParallel =
(currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
? CudaThreadsAvail
: currTaskDescr->ThreadLimit() * NumLanes;
} else
CudaThreadsForParallel = CudaThreadsAvail;
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
// On Volta and newer architectures we require that all lanes in
// a warp participate in the parallel region. Round down to a
// multiple of WARPSIZE since it is legal to do so in OpenMP.
// CudaThreadsAvail is the number of workers available in this
// kernel instance and is greater than or equal to
// currTaskDescr->ThreadLimit().
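  // For example, when fewer threads than available are requested, a request
  // for 48 threads is rounded down to 32, and a request below WARPSIZE falls
  // back to a single thread.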
if (CudaThreadsForParallel < CudaThreadsAvail) {
CudaThreadsForParallel =
(CudaThreadsForParallel < WARPSIZE)
? 1
: CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
}
#endif
ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
"bad thread request of %d threads", CudaThreadsForParallel);
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
"only team master can create parallel");
// set number of threads on work descriptor
// this is different from the number of cuda threads required for the parallel
// region
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
CudaThreadsForParallel / NumLanes);
// init counters (copy start to init)
workDescr.CounterGroup().Reset();
}
// All workers call this function; those not needed for the parallel region
// are deactivated.
// WorkFn - returns the outlined work function to execute.
// Returns true if this thread is active, false otherwise.
//
// Only the worker threads call this routine; the master warp never does.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
int16_t IsOMPRuntimeInitialized) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
// Work function and arguments for L1 parallel region.
*WorkFn = omptarget_nvptx_workFn;
if (!IsOMPRuntimeInitialized)
return true;
// If this is the termination signal from the master, quit early.
if (!*WorkFn)
return false;
// Only the worker threads call this routine and the master warp
// never arrives here. Therefore, use the nvptx thread id.
int threadId = GetThreadIdInBlock();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
// Set to true for workers participating in the parallel region.
bool isActive = false;
// Initialize state for active threads.
if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) {
    // init this thread's task descriptor from the work descriptor
omptarget_nvptx_TaskDescr *newTaskDescr =
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
    // init the private counter from the init value
workDescr.CounterGroup().Init(
omptarget_nvptx_threadPrivateContext->Priv(threadId));
PRINT(LD_PAR,
"thread will execute parallel region with id %d in a team of "
"%d threads\n",
newTaskDescr->ThreadId(), newTaskDescr->NThreads());
isActive = true;
}
return isActive;
}
EXTERN void __kmpc_kernel_end_parallel() {
// pop stack
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
// Only the worker threads call this routine and the master warp
// never arrives here. Therefore, use the nvptx thread id.
int threadId = GetThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());
}
////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
// assume this is only called for nested parallel
int threadId = GetLogicalThreadIdInBlock();
  // Unlike an actual parallel region, threads in the same team do not share
  // the workTaskDescr in this case, and the number of threads is fixed to 1.
  // Get the current task.
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
currTaskDescr->SaveLoopData();
// allocate new task descriptor and copy value from current one, set prev to
// it
omptarget_nvptx_TaskDescr *newTaskDescr =
(omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
(char *)"new seq parallel task");
newTaskDescr->CopyParent(currTaskDescr);
// tweak values for serialized parallel case:
// - each thread becomes ID 0 in its serialized parallel, and
// - there is only one thread per team
newTaskDescr->ThreadId() = 0;
newTaskDescr->ThreadsInTeam() = 1;
// set new task descriptor as top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
}
EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
// pop stack
int threadId = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
// set new top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());
// free
SafeFree(currTaskDescr, (char *)"new seq parallel task");
currTaskDescr = getMyTopTaskDescriptor(threadId);
currTaskDescr->RestoreLoopData();
}
EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_parallel_level\n");
int threadId = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
if (currTaskDescr->InL2OrHigherParallelRegion())
return 2;
else if (currTaskDescr->InParallelRegion())
return 1;
else
return 0;
}
// This kmpc call returns the thread id across all teams. Its value is
// cached by the compiler and used when calling the runtime. On nvptx
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) {
return GetLogicalThreadIdInBlock();
}
////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////
EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
int32_t num_threads) {
PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
num_threads;
}
EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
int32_t simd_limit) {
PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}
// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
EXTERN void __kmpc_push_num_teams(kmp_Indent *loc, int32_t tid,
int32_t num_teams, int32_t thread_limit) {
PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams);
ASSERT0(LT_FUSSY, FALSE,
"should never have anything with new teams on device");
}
EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t tid,
int proc_bind) {
PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind);
}