//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//
#ifndef __OMPTARGET_NVPTX_H
#define __OMPTARGET_NVPTX_H
// std includes
#include <stdint.h>
#include <stdlib.h>
#include <inttypes.h>
// cuda includes
#include <cuda.h>
#include <math.h>
// local includes
#include "counter_group.h"
#include "debug.h" // debug
#include "interface.h" // interfaces with omp, compiler, and user
#include "option.h" // choices we have
#include "state-queue.h"
#include "support.h"
#define OMPTARGET_NVPTX_VERSION 1.1
// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1
// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2
#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1
// Macros for CUDA intrinsics
// In CUDA 9.0, the *_sync() versions take an extra argument 'mask'.
// Also, __ballot(1) from CUDA 8.0 is replaced with __activemask().
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
  __shfl_down_sync((mask), (var), (delta), (width))
#define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate))
#define __ACTIVEMASK() __activemask()
#else
#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
  __shfl_down((var), (delta), (width))
#define __BALLOT_SYNC(mask, predicate) __ballot((predicate))
#define __ACTIVEMASK() __ballot(1)
#endif
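// Illustrative sketch only (the helper name is ours, not part of this
// interface): a warp-wide sum reduction written against the macros above so
// the same source builds with both pre- and post-CUDA-9 shuffle intrinsics.
// Assumes all lanes of the warp participate; partial warps need extra care.
INLINE int32_t example_warp_reduce_sum(int32_t val) {
  uint32_t mask = __ACTIVEMASK(); // expands to __ballot(1) before CUDA 9.0
  for (int offset = WARPSIZE / 2; offset > 0; offset /= 2)
    val += __SHFL_DOWN_SYNC(mask, val, offset, WARPSIZE);
  return val; // lane 0 now holds the warp-wide sum
}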
// arguments needed for L0 parallelism only.
class omptarget_nvptx_SharedArgs {
public:
  // All these methods must be called by the master thread only.
  INLINE void Init() {
    args = buffer;
    nArgs = MAX_SHARED_ARGS;
  }
  INLINE void DeInit() {
    // Free any memory allocated for an outlined parallel function with a
    // large number of arguments.
    if (nArgs > MAX_SHARED_ARGS) {
      SafeFree(args, (char *)"new extended args");
      Init();
    }
  }
  INLINE void EnsureSize(size_t size) {
    if (size > nArgs) {
      if (nArgs > MAX_SHARED_ARGS) {
        SafeFree(args, (char *)"new extended args");
      }
      args = (void **)SafeMalloc(size * sizeof(void *),
                                 (char *)"new extended args");
      nArgs = size;
    }
  }
  // Called by all threads.
  INLINE void **GetArgs() { return args; }

private:
  // buffer of pre-allocated arguments.
  void *buffer[MAX_SHARED_ARGS];
  // pointer to the arguments buffer; starts off pointing at 'buffer' but can
  // be dynamically allocated.
  void **args;
  // number of argument slots; starts off as MAX_SHARED_ARGS but can grow.
  uint32_t nArgs;
};
extern __device__ __shared__ omptarget_nvptx_SharedArgs
    omptarget_nvptx_globalArgs;
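// Illustrative sketch only (the helper name is hypothetical, not part of
// this interface): how the master thread is expected to drive
// omptarget_nvptx_globalArgs around an outlined parallel region that takes
// 'n' pointer-sized arguments.
INLINE void example_share_args(void **privateArgs, uint32_t n) {
  // Master thread only: grow the shared buffer when n > MAX_SHARED_ARGS.
  omptarget_nvptx_globalArgs.EnsureSize(n);
  // Any thread may read the shared pointer array.
  void **sharedArgs = omptarget_nvptx_globalArgs.GetArgs();
  for (uint32_t i = 0; i < n; ++i)
    sharedArgs[i] = privateArgs[i];
  // ... workers execute the outlined function with sharedArgs ...
  // Master thread only: release any dynamically extended buffer.
  omptarget_nvptx_globalArgs.DeInit();
}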
// Data sharing related quantities; these must match what the compiler uses.
enum DATA_SHARING_SIZES {
  // The maximum number of workers in a kernel.
  DS_Max_Worker_Threads = 992,
  // The size reserved for data in a shared memory slot.
  DS_Slot_Size = 256,
  // The slot size that should be reserved for a working warp.
  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
  // The maximum number of warps in use.
  DS_Max_Warp_Number = 32,
};
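// A worked check of the sizes above, assuming WARPSIZE == 32 (an assumption
// of this note, not something the header enforces): each worker warp then
// gets a root slot of 32 * 256 = 8192 bytes of data-sharing space, and
// DS_Max_Worker_Threads = 992 = 1024 - 32, i.e. one full warp of a maximal
// 1024-thread team is reserved for the master.
#if WARPSIZE == 32
static_assert(DS_Worker_Warp_Slot_Size == 8192,
              "expected 8KB of data-sharing space per worker warp");
#endif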
// Data structure to keep in shared memory that traces the current slot, stack,
// and frame pointer as well as the active threads that didn't exit the current
// environment.
struct DataSharingStateTy {
  __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
  void *StackPtr[DS_Max_Warp_Number];
  void *FramePtr[DS_Max_Warp_Number];
  int32_t ActiveThreads[DS_Max_Warp_Number];
};
// Additional worker slot type which is initialized with the default worker
// slot size of WARPSIZE * DS_Slot_Size bytes.
struct __kmpc_data_sharing_worker_slot_static {
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd;
  char Data[DS_Worker_Warp_Slot_Size];
};
// Additional master slot type which is initialized with the default master
// slot size of DS_Slot_Size bytes.
struct __kmpc_data_sharing_master_slot_static {
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd;
  char Data[DS_Slot_Size];
};
extern __device__ __shared__ DataSharingStateTy DataSharingState;
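// Illustrative sketch only (hypothetical helper, heavily simplified): how a
// frame of 'size' bytes could be carved out of warp 'wid's current slot
// using the state above. The real push/pop logic also chains in new slots
// and tracks the active lanes; this shows only the happy path in one slot.
INLINE void *example_push_frame(int wid, size_t size) {
  void *frame = DataSharingState.StackPtr[wid];
  char *newStack = (char *)frame + size;
  // DataEnd is non-inclusive: the new frame must fit below it.
  if (newStack > (char *)DataSharingState.SlotPtr[wid]->DataEnd)
    return 0; // would need to chain a new slot; omitted here
  DataSharingState.FramePtr[wid] = frame;
  DataSharingState.StackPtr[wid] = (void *)newStack;
  return frame;
}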
////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state
class omptarget_nvptx_TaskDescr {
public:
  // methods for flags
  INLINE omp_sched_t GetRuntimeSched();
  INLINE void SetRuntimeSched(omp_sched_t sched);
  INLINE int IsDynamic() { return items.flags & TaskDescr_IsDynamic; }
  INLINE void SetDynamic() {
    items.flags = items.flags | TaskDescr_IsDynamic;
  }
  INLINE void ClearDynamic() {
    items.flags = items.flags & (~TaskDescr_IsDynamic);
  }
  INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; }
  INLINE int InL2OrHigherParallelRegion() {
    return items.flags & TaskDescr_InParL2P;
  }
  INLINE int IsParallelConstruct() {
    return items.flags & TaskDescr_IsParConstr;
  }
  INLINE int IsTaskConstruct() { return !IsParallelConstruct(); }
  // methods for other fields
  INLINE uint16_t &NThreads() { return items.nthreads; }
  INLINE uint16_t &ThreadLimit() { return items.threadlimit; }
  INLINE uint16_t &ThreadId() { return items.threadId; }
  INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; }
  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
    prev = taskDescr;
  }
  // init & copy
  INLINE void InitLevelZeroTaskDescr();
  INLINE void InitLevelOneTaskDescr(uint16_t tnum,
                                    omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr,
                              uint16_t tnum);
  INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                   uint16_t tid, uint16_t tnum);
  INLINE void SaveLoopData();
  INLINE void RestoreLoopData() const;

private:
  // bits for flags: (7 used, 1 free)
  //   3 bits (SchedMask) for runtime schedule
  //   1 bit (IsDynamic) for dynamic schedule (false = static)
  //   1 bit (InPar) if this thread has encountered one or more parallel
  //     regions
  //   1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
  //   1 bit (InParL2P) if this thread has encountered an L2 or higher
  //     parallel region
  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
  static const uint8_t TaskDescr_IsDynamic = 0x8;
  static const uint8_t TaskDescr_InPar = 0x10;
  static const uint8_t TaskDescr_IsParConstr = 0x20;
  static const uint8_t TaskDescr_InParL2P = 0x40;

  struct SavedLoopDescr_items {
    int64_t loopUpperBound;
    int64_t nextLowerBound;
    int64_t chunk;
    int64_t stride;
    kmp_sched_t schedule;
  } loopData;

  struct TaskDescr_items {
    uint8_t flags; // 7 bits used (see flags above)
    uint8_t unused;
    uint16_t nthreads;         // thread num for subsequent parallel regions
    uint16_t threadlimit;      // thread limit ICV
    uint16_t threadId;         // thread id
    uint16_t threadsInTeam;    // threads in current team
    uint64_t runtimeChunkSize; // runtime chunk size
  } items;

  omptarget_nvptx_TaskDescr *prev;
};
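// Worked example of the flag layout above (the encoding shown mirrors the
// masks for illustration only; the runtime's actual accessors live in
// omptarget-nvptxi.h): a dynamically scheduled task inside a parallel
// region, created by a parallel construct.
INLINE uint8_t example_encode_task_flags() {
  const uint8_t schedMask = 0x1 | 0x2 | 0x4; // same bits as SchedMask
  uint8_t flags = 0;
  flags |= (uint8_t)(omp_sched_dynamic - 1) & schedMask; // 3-bit schedule
  flags |= 0x8;  // IsDynamic
  flags |= 0x10; // InPar
  flags |= 0x20; // IsParConstr
  return flags;  // 0x39: schedule bits 001, dynamic, in parallel, par constr
}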
// build on kmp
typedef struct omptarget_nvptx_ExplicitTaskDescr {
  omptarget_nvptx_TaskDescr
      taskDescr;              // omptarget_nvptx task description (must be first)
  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
} omptarget_nvptx_ExplicitTaskDescr;
////////////////////////////////////////////////////////////////////////////////
// Descriptor of a parallel region (worksharing in general)
class omptarget_nvptx_WorkDescr {
public:
  // access to data
  INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; }
  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }
  // init
  INLINE void InitWorkDescr();

private:
  omptarget_nvptx_CounterGroup cg; // for barrier (no other needed)
  omptarget_nvptx_TaskDescr masterTaskICV;
  bool hasCancel;
};
////////////////////////////////////////////////////////////////////////////////
class omptarget_nvptx_TeamDescr {
public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
    return &levelZeroTaskDescr;
  }
  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
    return workDescrForActiveParallel;
  }
  INLINE omp_lock_t *CriticalLock() { return &criticalLock; }
  INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }

  // init
  INLINE void InitTeamDescr();

  INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
    // If this is invoked by the master thread of the master warp then
    // initialize it with a smaller slot.
    if (IsMasterThread) {
      // Do not initialize this slot again if it has already been initialized.
      if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
        return 0;
      // Initialize the pointer to the end of the slot given the size of the
      // data section. DataEnd is non-inclusive.
      master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
      // We currently do not have a next slot.
      master_rootS[0].Next = 0;
      master_rootS[0].Prev = 0;
      master_rootS[0].PrevSlotStackPtr = 0;
      return (__kmpc_data_sharing_slot *)&master_rootS[0];
    }
    // Do not initialize this slot again if it has already been initialized.
    if (worker_rootS[wid].DataEnd ==
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
      return 0;
    // Initialize the pointer to the end of the slot given the size of the
    // data section. DataEnd is non-inclusive.
    worker_rootS[wid].DataEnd =
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
    // We currently do not have a next slot.
    worker_rootS[wid].Next = 0;
    worker_rootS[wid].Prev = 0;
    worker_rootS[wid].PrevSlotStackPtr = 0;
    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
  }

  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
    worker_rootS[wid].DataEnd =
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
    // We currently do not have a next slot.
    worker_rootS[wid].Next = 0;
    worker_rootS[wid].Prev = 0;
    worker_rootS[wid].PrevSlotStackPtr = 0;
    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
  }

private:
  omptarget_nvptx_TaskDescr
      levelZeroTaskDescr; // icv for team master initial thread
  omptarget_nvptx_WorkDescr
      workDescrForActiveParallel; // one, ONLY for the active parallel region
  omp_lock_t criticalLock;
  uint64_t lastprivateIterBuffer;

  __align__(16)
      __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
  __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
};
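// Illustrative sketch only (hypothetical helper): how a warp's data-sharing
// stack could be seeded from its root slot at kernel start. 'wid' is the
// warp id within the team; RootS returns 0 when the slot was already
// initialized on an earlier call.
INLINE void example_init_warp_stack(omptarget_nvptx_TeamDescr &team, int wid,
                                    bool isMasterThread) {
  __kmpc_data_sharing_slot *root = team.RootS(wid, isMasterThread);
  if (!root)
    return; // already initialized
  DataSharingState.SlotPtr[wid] = root;
  DataSharingState.StackPtr[wid] = (void *)&root->Data[0];
}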
////////////////////////////////////////////////////////////////////////////////
// thread private data (struct of arrays for better coalescing)
// tid refers here to the global thread id
// we do not support multiple concurrent kernels at this time
class omptarget_nvptx_ThreadPrivateContext {
public:
  // task
  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
    return &levelOneTaskDescr[tid];
  }
  INLINE void SetTopLevelTaskDescr(int tid,
                                   omptarget_nvptx_TaskDescr *taskICV) {
    topTaskDescr[tid] = taskICV;
  }
  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid);
  // parallel
  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
    return nextRegion.tnum[tid];
  }
  // simd
  INLINE uint16_t &SimdLimitForNextSimd(int tid) {
    return nextRegion.slim[tid];
  }
  // sync
  INLINE Counter &Priv(int tid) { return priv[tid]; }
  INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; }
  // schedule (for dispatch)
  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
  INLINE int64_t &Stride(int tid) { return stride[tid]; }
  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
  INLINE void InitThreadPrivateContext(int tid);
  INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; }
  INLINE uint64_t GetSourceQueue() { return SourceQueue; }

private:
  // team context for this team
  omptarget_nvptx_TeamDescr teamContext;
  // task ICV for implicit threads in the only parallel region
  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
  // pointer to where the current task ICV is found (top of the stack)
  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
  union {
    // Only one of the two is live at the same time.
    // parallel
    uint16_t tnum[MAX_THREADS_PER_TEAM];
    // simd limit
    uint16_t slim[MAX_THREADS_PER_TEAM];
  } nextRegion;
  // sync
  Counter priv[MAX_THREADS_PER_TEAM];
  // schedule (for dispatch)
  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
  int64_t chunk[MAX_THREADS_PER_TEAM];
  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
  // state for dispatch with dyn/guided OR static (never use both at a time)
  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
  // Queue to which this object must be returned.
  uint64_t SourceQueue;
};
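// Illustrative sketch only (hypothetical helper): the bookkeeping the
// per-thread dispatch arrays above exist for, shown as one step of a chunked
// schedule. The real dispatch logic lives in the loop-scheduling code.
INLINE int example_next_chunk(omptarget_nvptx_ThreadPrivateContext *ctx,
                              int tid, int64_t *lb, int64_t *ub) {
  int64_t lower = ctx->NextLowerBound(tid);
  if (lower > ctx->LoopUpperBound(tid))
    return DISPATCH_FINISHED;
  int64_t upper = lower + ctx->Chunk(tid) - 1; // chunk counts iterations
  if (upper > ctx->LoopUpperBound(tid))
    upper = ctx->LoopUpperBound(tid);
  *lb = lower;
  *ub = upper;
  // The stride jumps over the chunks handed to the other threads.
  ctx->NextLowerBound(tid) = lower + ctx->Stride(tid);
  return DISPATCH_NOTFINISHED;
}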
/// Device environment data
struct omptarget_device_environmentTy {
  int32_t debug_level;
};
////////////////////////////////////////////////////////////////////////////////
// global device environment
////////////////////////////////////////////////////////////////////////////////
extern __device__ omptarget_device_environmentTy omptarget_device_environment;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////
extern __device__ __shared__
    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
extern __device__ __shared__ uint32_t execution_param;
extern __device__ __shared__ void *ReductionScratchpadPtr;
////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
// needed for L1 parallelism only.
////////////////////////////////////////////////////////////////////////////////
typedef void *omptarget_nvptx_WorkFn;
extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
    omptarget_nvptx_workFn;
////////////////////////////////////////////////////////////////////////////////
// get private data structures
////////////////////////////////////////////////////////////////////////////////
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor();
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
////////////////////////////////////////////////////////////////////////////////
// inlined implementation
////////////////////////////////////////////////////////////////////////////////
#include "counter_groupi.h"
#include "omptarget-nvptxi.h"
#include "supporti.h"
#endif