[libomptarget] Refactor activemask macro to inline function
Summary:
[libomptarget] Refactor activemask macro to inline function
See also abandoned D66846, split into this diff and others.
Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers
Reviewed By: jdoerfert, ABataev
Subscribers: openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D66851
git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@370781 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index 5463adc..bd4cfec 100644
--- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -20,7 +20,7 @@
// Return true if this is the first active thread in the warp.
INLINE static bool IsWarpMasterActiveThread() {
- unsigned long long Mask = __ACTIVEMASK();
+ unsigned long long Mask = __kmpc_impl_activemask();
unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
unsigned long long Sh = Mask << ShNum;
// Truncate Sh to the 32 lower bits
@@ -112,7 +112,7 @@
(unsigned long long)SharingDefaultDataSize);
unsigned WID = getWarpId();
- unsigned CurActiveThreads = __ACTIVEMASK();
+ __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
@@ -252,7 +252,7 @@
return;
}
- int32_t CurActive = __ACTIVEMASK();
+ __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
// Only the warp master can restore the stack and frame information, and only
// if there are no other threads left behind in this environment (i.e. the
@@ -378,7 +378,7 @@
// Frame pointer must be visible to all workers in the same warp.
const unsigned WID = getWarpId();
void *FrameP = 0;
- int32_t CurActive = __ACTIVEMASK();
+ __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
if (IsWarpMaster) {
// SlotP will point to either the shared memory slot or an existing
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index 3ed44f7..f9a3015 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -389,7 +389,7 @@
}
INLINE static uint64_t NextIter() {
- __kmpc_impl_lanemask_t active = __ACTIVEMASK();
+ __kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
uint32_t leader = __kmpc_impl_ffs(active) - 1;
uint32_t change = __kmpc_impl_popc(active);
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index f5eb00b..2299d24 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -45,17 +45,6 @@
#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1
-// Macros for Cuda intrinsics
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
-// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
-#ifndef CUDA_VERSION
-#error CUDA_VERSION macro is undefined, something wrong with cuda.
-#elif CUDA_VERSION >= 9000
-#define __ACTIVEMASK() __activemask()
-#else
-#define __ACTIVEMASK() __ballot(1)
-#endif // CUDA_VERSION
-
// arguments needed for L0 parallelism only.
class omptarget_nvptx_SharedArgs {
public:
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index ee47cc4..d8049d9 100644
--- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -53,7 +53,7 @@
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
- uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
+ __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
*LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
@@ -126,7 +126,7 @@
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
- uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
+ __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index ea53f61..347c556 100644
--- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -61,12 +61,12 @@
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
uint32_t size, remote_id, physical_lane_id;
physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
- uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
- uint32_t Liveness = __ACTIVEMASK();
+ __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
+ __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
- uint32_t lanemask_gt = __kmpc_impl_lanemask_gt();
+ __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
do {
- Liveness = __ACTIVEMASK();
+ Liveness = __kmpc_impl_activemask();
remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
size = __kmpc_impl_popc(Liveness);
logical_lane_id /= 2;
@@ -81,7 +81,7 @@
size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct) {
- uint32_t Liveness = __ACTIVEMASK();
+ __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == 0xffffffff) {
gpu_regular_warp_reduce(reduce_data, shflFct);
return GetThreadIdInBlock() % WARPSIZE ==
@@ -142,7 +142,7 @@
}
return BlockThreadId == 0;
#else
- uint32_t Liveness = __ACTIVEMASK();
+ __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == 0xffffffff) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
@@ -317,7 +317,7 @@
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
// Reduce across warps to the warp master.
- uint32_t Liveness = __ACTIVEMASK();
+ __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
if (Liveness == 0xffffffff) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
else // Partial warp but contiguous lanes
diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 884982d..1a0f1a9 100644
--- a/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -204,9 +204,9 @@
// Parallel level
INLINE void IncParallelLevel(bool ActiveParallel) {
- unsigned Active = __ACTIVEMASK();
+ __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
__kmpc_impl_syncwarp(Active);
- unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
+ __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] +=
@@ -217,9 +217,9 @@
}
INLINE void DecParallelLevel(bool ActiveParallel) {
- unsigned Active = __ACTIVEMASK();
+ __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
__kmpc_impl_syncwarp(Active);
- unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
+ __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] -=
diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/nvptx/src/sync.cu
index 3ce695c..343293e 100644
--- a/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -142,7 +142,7 @@
EXTERN int32_t __kmpc_warp_active_thread_mask() {
PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
- return __ACTIVEMASK();
+ return __kmpc_impl_activemask();
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index 8986195..8008143 100644
--- a/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -48,6 +48,16 @@
#error CUDA_VERSION macro is undefined, something wrong with cuda.
#endif
+// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
+
+INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
+#if CUDA_VERSION >= 9000
+ return __activemask();
+#else
+ return __ballot(1);
+#endif
+}
+
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,