| //===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file contains the declarations of all library macros, types, |
| // and functions. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef __OMPTARGET_NVPTX_H |
| #define __OMPTARGET_NVPTX_H |
| |
| // std includes |
| #include <stdint.h> |
| #include <stdlib.h> |
| |
| #include <inttypes.h> |
| |
| // cuda includes |
| #include <cuda.h> |
| #include <math.h> |
| |
| // local includes |
| #include "debug.h" // debug |
| #include "interface.h" // interfaces with omp, compiler, and user |
| #include "option.h" // choices we have |
| #include "state-queue.h" |
| #include "support.h" |
| |
| #define OMPTARGET_NVPTX_VERSION 1.1 |
| |
| // used by the library for the interface with the app |
| #define DISPATCH_FINISHED 0 |
| #define DISPATCH_NOTFINISHED 1 |
| |
| // used by dynamic scheduling |
| #define FINISHED 0 |
| #define NOT_FINISHED 1 |
| #define LAST_CHUNK 2 |
| |
| #define BARRIER_COUNTER 0 |
| #define ORDERED_COUNTER 1 |
| |
| // Macros for Cuda intrinsics |
| // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. |
| // Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). |
| #ifndef CUDA_VERSION |
| #error CUDA_VERSION macro is undefined, something wrong with cuda. |
| #elif CUDA_VERSION >= 9000 |
| #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ |
| __shfl_down_sync((mask), (var), (delta), (width)) |
| #define __ACTIVEMASK() __activemask() |
| #define __SYNCWARP(Mask) __syncwarp(Mask) |
| #else |
| #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ |
| __shfl_down((var), (delta), (width)) |
| #define __ACTIVEMASK() __ballot(1) |
| // In Cuda < 9.0 no need to sync threads in warps. |
| #define __SYNCWARP(Mask) |
| #endif // CUDA_VERSION |
| |
| #define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory"); |
| // Use original __syncthreads if compiled by nvcc or clang >= 9.0. |
| #if !defined(__clang__) || __clang_major__ >= 9 |
| #define __SYNCTHREADS() __syncthreads() |
| #else |
| #define __SYNCTHREADS() __SYNCTHREADS_N(0) |
| #endif |
| |
| // arguments needed for L0 parallelism only. |
| class omptarget_nvptx_SharedArgs { |
| public: |
| // All these methods must be called by the master thread only. |
| INLINE void Init() { |
| args = buffer; |
| nArgs = MAX_SHARED_ARGS; |
| } |
| INLINE void DeInit() { |
| // Free any memory allocated for outlined parallel function with a large |
| // number of arguments. |
| if (nArgs > MAX_SHARED_ARGS) { |
| SafeFree(args, (char *)"new extended args"); |
| Init(); |
| } |
| } |
| INLINE void EnsureSize(size_t size) { |
| if (size > nArgs) { |
| if (nArgs > MAX_SHARED_ARGS) { |
| SafeFree(args, (char *)"new extended args"); |
| } |
| args = (void **) SafeMalloc(size * sizeof(void *), |
| (char *)"new extended args"); |
| nArgs = size; |
| } |
| } |
| // Called by all threads. |
| INLINE void **GetArgs() const { return args; }; |
| private: |
| // buffer of pre-allocated arguments. |
| void *buffer[MAX_SHARED_ARGS]; |
| // pointer to arguments buffer. |
| // starts off as a pointer to 'buffer' but can be dynamically allocated. |
| void **args; |
| // starts off as MAX_SHARED_ARGS but can increase in size. |
| uint32_t nArgs; |
| }; |
| |
| extern __device__ __shared__ omptarget_nvptx_SharedArgs |
| omptarget_nvptx_globalArgs; |
| |
| // Data sharing related quantities, need to match what is used in the compiler. |
| enum DATA_SHARING_SIZES { |
| // The maximum number of workers in a kernel. |
| DS_Max_Worker_Threads = 992, |
| // The size reserved for data in a shared memory slot. |
| DS_Slot_Size = 256, |
| // The slot size that should be reserved for a working warp. |
| DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, |
| // The maximum number of warps in use |
| DS_Max_Warp_Number = 32, |
| // The size of the preallocated shared memory buffer per team |
| DS_Shared_Memory_Size = 128, |
| }; |
| |
| // Data structure to keep in shared memory that traces the current slot, stack, |
| // and frame pointer as well as the active threads that didn't exit the current |
| // environment. |
| struct DataSharingStateTy { |
| __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; |
| void *StackPtr[DS_Max_Warp_Number]; |
| void * volatile FramePtr[DS_Max_Warp_Number]; |
| int32_t ActiveThreads[DS_Max_Warp_Number]; |
| }; |
| // Additional worker slot type which is initialized with the default worker slot |
| // size of 4*32 bytes. |
| struct __kmpc_data_sharing_worker_slot_static { |
| __kmpc_data_sharing_slot *Next; |
| __kmpc_data_sharing_slot *Prev; |
| void *PrevSlotStackPtr; |
| void *DataEnd; |
| char Data[DS_Worker_Warp_Slot_Size]; |
| }; |
| // Additional master slot type which is initialized with the default master slot |
| // size of 4 bytes. |
| struct __kmpc_data_sharing_master_slot_static { |
| __kmpc_data_sharing_slot *Next; |
| __kmpc_data_sharing_slot *Prev; |
| void *PrevSlotStackPtr; |
| void *DataEnd; |
| char Data[DS_Slot_Size]; |
| }; |
| extern __device__ __shared__ DataSharingStateTy DataSharingState; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // task ICV and (implicit & explicit) task state |
| |
| class omptarget_nvptx_TaskDescr { |
| public: |
| // methods for flags |
| INLINE omp_sched_t GetRuntimeSched() const; |
| INLINE void SetRuntimeSched(omp_sched_t sched); |
| INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; } |
| INLINE int InL2OrHigherParallelRegion() const { |
| return items.flags & TaskDescr_InParL2P; |
| } |
| INLINE int IsParallelConstruct() const { |
| return items.flags & TaskDescr_IsParConstr; |
| } |
| INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } |
| // methods for other fields |
| INLINE uint16_t &ThreadId() { return items.threadId; } |
| INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } |
| INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } |
| INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { |
| prev = taskDescr; |
| } |
| // init & copy |
| INLINE void InitLevelZeroTaskDescr(); |
| INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr); |
| INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); |
| INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); |
| INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); |
| INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); |
| INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr); |
| INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); |
| INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, |
| uint16_t tid, uint16_t tnum); |
| INLINE void SaveLoopData(); |
| INLINE void RestoreLoopData() const; |
| |
| private: |
| // bits for flags: (6 used, 2 free) |
| // 3 bits (SchedMask) for runtime schedule |
| // 1 bit (InPar) if this thread has encountered one or more parallel region |
| // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) |
| // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel |
| // region |
| static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); |
| static const uint8_t TaskDescr_InPar = 0x10; |
| static const uint8_t TaskDescr_IsParConstr = 0x20; |
| static const uint8_t TaskDescr_InParL2P = 0x40; |
| |
| struct SavedLoopDescr_items { |
| int64_t loopUpperBound; |
| int64_t nextLowerBound; |
| int64_t chunk; |
| int64_t stride; |
| kmp_sched_t schedule; |
| } loopData; |
| |
| struct TaskDescr_items { |
| uint8_t flags; // 6 bit used (see flag above) |
| uint8_t unused; |
| uint16_t threadId; // thread id |
| uint64_t runtimeChunkSize; // runtime chunk size |
| } items; |
| omptarget_nvptx_TaskDescr *prev; |
| }; |
| |
| // build on kmp |
| typedef struct omptarget_nvptx_ExplicitTaskDescr { |
| omptarget_nvptx_TaskDescr |
| taskDescr; // omptarget_nvptx task description (must be first) |
| kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) |
| } omptarget_nvptx_ExplicitTaskDescr; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // Descriptor of a parallel region (worksharing in general) |
| |
| class omptarget_nvptx_WorkDescr { |
| |
| public: |
| // access to data |
| INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } |
| |
| private: |
| omptarget_nvptx_TaskDescr masterTaskICV; |
| }; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| class omptarget_nvptx_TeamDescr { |
| public: |
| // access to data |
| INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { |
| return &levelZeroTaskDescr; |
| } |
| INLINE omptarget_nvptx_WorkDescr &WorkDescr() { |
| return workDescrForActiveParallel; |
| } |
| INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } |
| |
| // init |
| INLINE void InitTeamDescr(); |
| |
| INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { |
| // If this is invoked by the master thread of the master warp then intialize |
| // it with a smaller slot. |
| if (IsMasterThread) { |
| // Do not initalize this slot again if it has already been initalized. |
| if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size) |
| return 0; |
| // Initialize the pointer to the end of the slot given the size of the |
| // data section. DataEnd is non-inclusive. |
| master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; |
| // We currently do not have a next slot. |
| master_rootS[0].Next = 0; |
| master_rootS[0].Prev = 0; |
| master_rootS[0].PrevSlotStackPtr = 0; |
| return (__kmpc_data_sharing_slot *)&master_rootS[0]; |
| } |
| // Do not initalize this slot again if it has already been initalized. |
| if (worker_rootS[wid].DataEnd == |
| &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size) |
| return 0; |
| // Initialize the pointer to the end of the slot given the size of the data |
| // section. DataEnd is non-inclusive. |
| worker_rootS[wid].DataEnd = |
| &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; |
| // We currently do not have a next slot. |
| worker_rootS[wid].Next = 0; |
| worker_rootS[wid].Prev = 0; |
| worker_rootS[wid].PrevSlotStackPtr = 0; |
| return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; |
| } |
| |
| INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) { |
| worker_rootS[wid].DataEnd = |
| &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; |
| // We currently do not have a next slot. |
| worker_rootS[wid].Next = 0; |
| worker_rootS[wid].Prev = 0; |
| worker_rootS[wid].PrevSlotStackPtr = 0; |
| return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; |
| } |
| |
| private: |
| omptarget_nvptx_TaskDescr |
| levelZeroTaskDescr; // icv for team master initial thread |
| omptarget_nvptx_WorkDescr |
| workDescrForActiveParallel; // one, ONLY for the active par |
| uint64_t lastprivateIterBuffer; |
| |
| __align__(16) |
| __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; |
| __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; |
| }; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // thread private data (struct of arrays for better coalescing) |
| // tid refers here to the global thread id |
| // do not support multiple concurrent kernel a this time |
| class omptarget_nvptx_ThreadPrivateContext { |
| public: |
| // task |
| INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { |
| return &levelOneTaskDescr[tid]; |
| } |
| INLINE void SetTopLevelTaskDescr(int tid, |
| omptarget_nvptx_TaskDescr *taskICV) { |
| topTaskDescr[tid] = taskICV; |
| } |
| INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const; |
| // parallel |
| INLINE uint16_t &NumThreadsForNextParallel(int tid) { |
| return nextRegion.tnum[tid]; |
| } |
| // simd |
| INLINE uint16_t &SimdLimitForNextSimd(int tid) { |
| return nextRegion.slim[tid]; |
| } |
| // schedule (for dispatch) |
| INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } |
| INLINE int64_t &Chunk(int tid) { return chunk[tid]; } |
| INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } |
| INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } |
| INLINE int64_t &Stride(int tid) { return stride[tid]; } |
| |
| INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } |
| |
| INLINE void InitThreadPrivateContext(int tid); |
| INLINE uint64_t &Cnt() { return cnt; } |
| |
| private: |
| // team context for this team |
| omptarget_nvptx_TeamDescr teamContext; |
| // task ICV for implict threads in the only parallel region |
| omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; |
| // pointer where to find the current task ICV (top of the stack) |
| omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; |
| union { |
| // Only one of the two is live at the same time. |
| // parallel |
| uint16_t tnum[MAX_THREADS_PER_TEAM]; |
| // simd limit |
| uint16_t slim[MAX_THREADS_PER_TEAM]; |
| } nextRegion; |
| // schedule (for dispatch) |
| kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for |
| int64_t chunk[MAX_THREADS_PER_TEAM]; |
| int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; |
| // state for dispatch with dyn/guided OR static (never use both at a time) |
| int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; |
| int64_t stride[MAX_THREADS_PER_TEAM]; |
| uint64_t cnt; |
| }; |
| |
| /// Device envrionment data |
| struct omptarget_device_environmentTy { |
| int32_t debug_level; |
| }; |
| |
| /// Memory manager for statically allocated memory. |
| class omptarget_nvptx_SimpleMemoryManager { |
| private: |
| __align__(128) struct MemDataTy { |
| volatile unsigned keys[OMP_STATE_COUNT]; |
| } MemData[MAX_SM]; |
| |
| INLINE static uint32_t hash(unsigned key) { |
| return key & (OMP_STATE_COUNT - 1); |
| } |
| |
| public: |
| INLINE void Release(); |
| INLINE const void *Acquire(const void *buf, size_t size); |
| }; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // global device envrionment |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| extern __device__ omptarget_device_environmentTy omptarget_device_environment; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // global data tables |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| extern __device__ omptarget_nvptx_SimpleMemoryManager |
| omptarget_nvptx_simpleMemoryManager; |
| extern __device__ __shared__ uint32_t usedMemIdx; |
| extern __device__ __shared__ uint32_t usedSlotIdx; |
| extern __device__ __shared__ uint8_t |
| parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; |
| extern __device__ __shared__ uint16_t threadLimit; |
| extern __device__ __shared__ uint16_t threadsInTeam; |
| extern __device__ __shared__ uint16_t nThreads; |
| extern __device__ __shared__ |
| omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; |
| |
| extern __device__ __shared__ uint32_t execution_param; |
| extern __device__ __shared__ void *ReductionScratchpadPtr; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // work function (outlined parallel/simd functions) and arguments. |
| // needed for L1 parallelism only. |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| typedef void *omptarget_nvptx_WorkFn; |
| extern volatile __device__ __shared__ omptarget_nvptx_WorkFn |
| omptarget_nvptx_workFn; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // get private data structures |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); |
| INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); |
| INLINE omptarget_nvptx_TaskDescr * |
| getMyTopTaskDescriptor(bool isSPMDExecutionMode); |
| INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // inlined implementation |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| #include "omptarget-nvptxi.h" |
| #include "supporti.h" |
| |
| #endif |