[OpenMP][libomptarget] Use shared memory variable for tracking parallel level
Summary: Replace existing infrastructure for tracking parallel level using global memory with a per-team shared memory variable. This minimizes the impact of the overhead of tracking the parallel level for non-nested cases.
Reviewers: ABataev, caomhin
Reviewed By: ABataev
Subscribers: guansong, openmp-commits
Differential Revision: https://reviews.llvm.org/D55773
git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@350747 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/libomptarget/deviceRTLs/nvptx/src/libcall.cu
index 9abe599..63bf6b4 100644
--- a/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -165,7 +165,7 @@
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
"Expected SPMD mode only with uninitialized runtime.");
- return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
+ return parallelLevel;
}
int level = 0;
omptarget_nvptx_TaskDescr *currTaskDescr =
diff --git a/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
index 35f94ac..0700577 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -27,22 +27,17 @@
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
-__device__ omptarget_nvptx_Queue<omptarget_nvptx_SimpleThreadPrivateContext,
- OMP_STATE_COUNT>
- omptarget_nvptx_device_simpleState[MAX_SM];
-
__device__ omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;
__device__ __shared__ uint32_t usedMemIdx;
__device__ __shared__ uint32_t usedSlotIdx;
+__device__ __shared__ uint8_t parallelLevel;
+
// Pointer to this team's OpenMP state object
__device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
-__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
- *omptarget_nvptx_simpleThreadPrivateContext;
-
////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index 2a3d49c..7034d02 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -21,10 +21,6 @@
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
-extern __device__ omptarget_nvptx_Queue<
- omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
- omptarget_nvptx_device_simpleState[MAX_SM];
-
////////////////////////////////////////////////////////////////////////////////
// init entry points
////////////////////////////////////////////////////////////////////////////////
@@ -100,14 +96,10 @@
// If OMP runtime is not required don't initialize OMP state.
setExecutionParameters(Spmd, RuntimeUninitialized);
if (GetThreadIdInBlock() == 0) {
- int slot = smid() % MAX_SM;
- usedSlotIdx = slot;
- omptarget_nvptx_simpleThreadPrivateContext =
- omptarget_nvptx_device_simpleState[slot].Dequeue();
+ parallelLevel = 0;
+ usedSlotIdx = smid() % MAX_SM;
}
- // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
- omptarget_nvptx_simpleThreadPrivateContext->Init();
return;
}
setExecutionParameters(Spmd, RuntimeInitialized);
@@ -172,18 +164,12 @@
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
// We're not going to pop the task descr stack of each thread since
// there are no more parallel regions in SPMD mode.
+ if (!RequiresOMPRuntime)
+ return;
+
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
int threadId = GetThreadIdInBlock();
- if (!RequiresOMPRuntime) {
- if (threadId == 0) {
- // Enqueue omp state object for use by another team.
- int slot = usedSlotIdx;
- omptarget_nvptx_device_simpleState[slot].Enqueue(
- omptarget_nvptx_simpleThreadPrivateContext);
- }
- return;
- }
if (threadId == 0) {
// Enqueue omp state object for use by another team.
int slot = usedSlotIdx;
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index cb6c0b7..d23010e 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -391,39 +391,6 @@
INLINE const void *Acquire(const void *buf, size_t size);
};
-class omptarget_nvptx_SimpleThreadPrivateContext {
- uint16_t par_level[MAX_THREADS_PER_TEAM];
-
-public:
- INLINE void Init() {
- ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
- "Expected SPMD + uninitialized runtime modes.");
- par_level[GetThreadIdInBlock()] = 0;
- }
- INLINE void IncParLevel() {
- ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
- "Expected SPMD + uninitialized runtime modes.");
- ++par_level[GetThreadIdInBlock()];
- }
- INLINE void DecParLevel() {
- ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
- "Expected SPMD + uninitialized runtime modes.");
- ASSERT0(LT_FUSSY, par_level[GetThreadIdInBlock()] > 0,
- "Expected parallel level >0.");
- --par_level[GetThreadIdInBlock()];
- }
- INLINE bool InL2OrHigherParallelRegion() const {
- ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
- "Expected SPMD + uninitialized runtime modes.");
- return par_level[GetThreadIdInBlock()] > 0;
- }
- INLINE uint16_t GetParallelLevel() const {
- ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
- "Expected SPMD + uninitialized runtime modes.");
- return par_level[GetThreadIdInBlock()] + 1;
- }
-};
-
////////////////////////////////////////////////////////////////////////////////
// global device envrionment
////////////////////////////////////////////////////////////////////////////////
@@ -440,10 +407,9 @@
omptarget_nvptx_simpleMemoryManager;
extern __device__ __shared__ uint32_t usedMemIdx;
extern __device__ __shared__ uint32_t usedSlotIdx;
+extern __device__ __shared__ uint8_t parallelLevel;
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
-extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
- *omptarget_nvptx_simpleThreadPrivateContext;
extern __device__ __shared__ uint32_t execution_param;
extern __device__ __shared__ void *ReductionScratchpadPtr;
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index c5edd31..8de8f59 100644
--- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -340,7 +340,11 @@
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- omptarget_nvptx_simpleThreadPrivateContext->IncParLevel();
+ __SYNCTHREADS();
+ if (GetThreadIdInBlock() == 0)
+ ++parallelLevel;
+ __SYNCTHREADS();
+
return;
}
@@ -379,7 +383,10 @@
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- omptarget_nvptx_simpleThreadPrivateContext->DecParLevel();
+ __SYNCTHREADS();
+ if (GetThreadIdInBlock() == 0)
+ --parallelLevel;
+ __SYNCTHREADS();
return;
}
@@ -401,7 +408,7 @@
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
+ return parallelLevel;
}
int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/nvptx/src/supporti.h
index ece3295..b8f661c 100644
--- a/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -155,8 +155,7 @@
ASSERT0(LT_FUSSY, isSPMDExecutionMode,
"Uninitialized runtime with non-SPMD mode.");
// For level 2 parallelism all parallel regions are executed sequentially.
- if (omptarget_nvptx_simpleThreadPrivateContext
- ->InL2OrHigherParallelRegion())
+ if (parallelLevel > 0)
rc = 0;
else
rc = GetThreadIdInBlock();
@@ -177,8 +176,7 @@
ASSERT0(LT_FUSSY, isSPMDExecutionMode,
"Uninitialized runtime with non-SPMD mode.");
// For level 2 parallelism all parallel regions are executed sequentially.
- if (omptarget_nvptx_simpleThreadPrivateContext
- ->InL2OrHigherParallelRegion())
+ if (parallelLevel > 0)
rc = 1;
else
rc = GetNumberOfThreadsInBlock();