// blob: 086f4c5d2c95809e506d77e657291335808c9ac1 [file] [log] [blame]
//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
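// This file contains the inline definitions of the member functions of the
// task, thread-private-context, work and team descriptors, together with the
// utility routines used to access those descriptors.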
//
//===----------------------------------------------------------------------===//
////////////////////////////////////////////////////////////////////////////////
// Task Descriptor
////////////////////////////////////////////////////////////////////////////////
// Returns the runtime schedule stored in this task descriptor.
// The schedule is kept in the TaskDescr_SchedMask bits of items.flags, biased
// by -1 (1..4 is stored as 0..3), so the bias is undone before returning.
INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() {
  // extract the encoded (0-based) schedule from the flags
  const uint8_t encoded = items.flags & TaskDescr_SchedMask;
  // back to the 1-based value, still truncated to 8 bits
  return static_cast<omp_sched_t>(static_cast<uint8_t>(encoded + 1));
}
// Stores `sched` as the runtime schedule of this task descriptor.
// The schedule lives in the TaskDescr_SchedMask bits of items.flags; all
// other bits of items.flags are left unchanged.
INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
  // sched starts from 1..4; encode it as 0..3; so sub 1 here.
  // The encoded value is masked so that an out-of-range sched (e.g. 0, which
  // wraps around to 0xFF) cannot spill into the unrelated flag bits.
  const uint8_t val = (((uint8_t)sched) - 1) & TaskDescr_SchedMask;
  // clear current sched
  items.flags &= ~TaskDescr_SchedMask;
  // set new sched
  items.flags |= val;
}
// Sets up this descriptor as the level-zero (sequential, master) task.
INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
  // slow method
  // flags all cleared:
  //   default sched is static,
  //   dyn is off (unused now anyway, but may need to sample from host ?)
  //   not in parallel
  items.flags = 0;
  items.threadId = 0;         // is master
  items.threadsInTeam = 1;    // sequential
  items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
  // threads: whatever was allocated by the kernel
  items.nthreads = GetNumberOfProcsInTeam();
}
// This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc.
//
// tnum            - stored as the number of threads in the team
// parentTaskDescr - becomes the `prev` link of this descriptor
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
    uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) {
  // slow method
  prev = parentTaskDescr;

  // flags:
  //   default sched is static,
  //   dyn is off (unused now anyway, but may need to sample from host ?)
  //   in L1 parallel
  items.flags = TaskDescr_InPar | TaskDescr_IsParConstr;
  items.nthreads = 0; // # threads for subsequent parallel region
  items.threadsInTeam = tnum;
  items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
  // get ids from cuda (only called for 1st level)
  items.threadId = GetThreadIdInBlock();
}
// Copies only the `items` payload of sourceTaskDescr into this descriptor;
// the `prev` link of this descriptor is not touched here.
INLINE void
omptarget_nvptx_TaskDescr::CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  items = sourceTaskDescr->items;
}
// Makes this descriptor a full copy of sourceTaskDescr: both the `items`
// payload and the `prev` link are taken from the source.
INLINE void
omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  // payload (same as CopyData)
  items = sourceTaskDescr->items;
  // share the source's predecessor
  prev = sourceTaskDescr->prev;
}
// Initializes this descriptor as a child of parentTaskDescr: the parent
// becomes the `prev` link and its `items` payload is copied.
INLINE void omptarget_nvptx_TaskDescr::CopyParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  // link to the parent itself (not to the parent's predecessor)
  prev = parentTaskDescr;
  // inherit the parent's payload
  CopyData(parentTaskDescr);
}
// Initializes this descriptor for an explicit task: it is set up as a child
// of parentTaskDescr (see CopyParent) and the "is parallel construct" flag
// inherited from the parent is cleared.
INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyParent(parentTaskDescr);
  // drop the parallel-construct bit, keep every other flag
  items.flags &= ~TaskDescr_IsParConstr;
  ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
}
// Initializes this (work) descriptor as a child of masterTaskDescr (see
// CopyParent) and then marks it as a parallel region with tnum threads.
INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
    omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) {
  CopyParent(masterTaskDescr);
  // overwrite specific items:
  // set number of threads
  items.threadsInTeam = tnum;
  // set flag to parallel
  items.flags |= TaskDescr_InPar | TaskDescr_IsParConstr;
}
// Initializes this descriptor from workTaskDescr: both the items and the prev
// link of workTaskDescr are copied (see Copy), then the thread id is replaced
// by the value returned by GetThreadIdInBlock().
INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
omptarget_nvptx_TaskDescr *workTaskDescr) {
Copy(workTaskDescr);
//
// overwrite specific items;
//
// The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
// This is so that the serial master (first lane in the master warp)
// gets a threadId of 0.
// However, we know that this function is always called in a parallel
// region where only workers are active. The serial master thread
// never enters this region. When a parallel region is executed serially,
// the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
// are called, which never activate this region.
items.threadId =
GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
}
// Initializes this descriptor as a child of parentTaskDescr (see CopyParent)
// for level 2+ parallelism, with the given thread id and team size.
INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
  CopyParent(parentTaskDescr);
  // overwrite specific items:
  items.threadId = tid;
  items.threadsInTeam = tnum;        // set number of threads
  items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
}
// Saves the loop state that the thread private context holds for this
// descriptor's thread id (upper bound, next lower bound, schedule type, chunk
// and stride) into loopData.
INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
  // every lookup below is keyed by the same thread id
  const int tid = items.threadId;

  loopData.loopUpperBound =
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
  loopData.nextLowerBound =
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
  loopData.schedule = omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
  loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
  loopData.stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
}
// Writes the loop state kept in loopData (chunk, upper bound, next lower
// bound, stride and schedule type) back into the thread private context slot
// of this descriptor's thread id.
INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
  // every store below targets the same thread id
  const int tid = items.threadId;

  omptarget_nvptx_threadPrivateContext->Chunk(tid) = loopData.chunk;
  omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) =
      loopData.loopUpperBound;
  omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) =
      loopData.nextLowerBound;
  omptarget_nvptx_threadPrivateContext->Stride(tid) = loopData.stride;
  omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = loopData.schedule;
}
////////////////////////////////////////////////////////////////////////////////
// Thread Private Context
////////////////////////////////////////////////////////////////////////////////
// Returns the top-level task descriptor recorded for thread `tid`.
// `tid` indexes topTaskDescr directly, so it must lie in
// [0, MAX_THREADS_PER_TEAM); this is checked by the assertion below.
INLINE omptarget_nvptx_TaskDescr *
omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) {
  // tid is a signed int: reject negative values as well as values past the
  // end of the array (the message text is kept unchanged).
  ASSERT0(
      LT_FUSSY, tid >= 0 && tid < MAX_THREADS_PER_TEAM,
      "Getting top level, tid is larger than allocated data structure size");
  return topTaskDescr[tid];
}
// Resets the per-thread slots of this context for thread `tid`.
INLINE void
omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
  // priv counter init to zero
  priv[tid] = 0;
  // no num threads value has been pushed
  nextRegion.tnum[tid] = 0;
  // top task descr is NULL (team master version will be fixed separately);
  // levelOneTaskDescr is init when starting the parallel region
  topTaskDescr[tid] = NULL;
  // Not initialized here because they are init when using dyn sched:
  //   current_Event, events_Number, chunk, num_Iterations, schedule
}
////////////////////////////////////////////////////////////////////////////////
// Work Descriptor
////////////////////////////////////////////////////////////////////////////////
// Resets the work descriptor: no cancellation pending and `cg` cleared.
INLINE void omptarget_nvptx_WorkDescr::InitWorkDescr() {
  hasCancel = FALSE;
  // start and stop to zero too
  cg.Clear();
  // threadsInParallelTeam does not need to be init (done in start parallel)
}
////////////////////////////////////////////////////////////////////////////////
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////
// Initializes the team descriptor by initializing its two embedded
// descriptors: the work descriptor for the active parallel region and the
// level-zero task descriptor.
INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
  workDescrForActiveParallel.InitWorkDescr();
  levelZeroTaskDescr.InitLevelZeroTaskDescr();
  // omp_init_lock(criticalLock);
}
////////////////////////////////////////////////////////////////////////////////
// Get private data structure for thread
////////////////////////////////////////////////////////////////////////////////
// Utility routines for CUDA threads
// Returns the team descriptor held by the global thread private context.
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
  omptarget_nvptx_TeamDescr &teamDescr =
      omptarget_nvptx_threadPrivateContext->TeamContext();
  return teamDescr;
}
// Returns the work descriptor of the team descriptor obtained from
// getMyTeamDescriptor().
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
  return getMyTeamDescriptor().WorkDescr();
}
// Returns the top-level task descriptor that the global thread private
// context records for thread `threadId`.
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
  omptarget_nvptx_TaskDescr *topTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  return topTaskDescr;
}
// Convenience overload: looks up the top-level task descriptor using the id
// returned by GetLogicalThreadIdInBlock().
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
  const int tid = GetLogicalThreadIdInBlock();
  return getMyTopTaskDescriptor(tid);
}