[OpenMP] Fix main thread barrier for Pascal and amdgpu
Fixes what's left of https://bugs.llvm.org/show_bug.cgi?id=51781.
Reviewed By: jdoerfert, JonChesterfield, tianshilei1992
Differential Revision: https://reviews.llvm.org/D113602
GitOrigin-RevId: c9dfe322eefca14ce07f73452f7327ecda57da30
diff --git a/libomptarget/DeviceRTL/src/Kernel.cpp b/libomptarget/DeviceRTL/src/Kernel.cpp
index bf3d4ca..65b554b 100644
--- a/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -86,7 +86,21 @@
if (mapping::isInitialThreadInLevel0(IsSPMD))
return -1;
- if (UseGenericStateMachine)
+ // Enter the generic state machine if enabled and if this thread can possibly
+ // be an active worker thread.
+ //
+ // The latter check is important for NVIDIA Pascal (but not Volta) and AMD
+ // GPU. In those cases, a single thread can apparently satisfy a barrier on
+ // behalf of all threads in the same warp. Thus, it would not be safe for
+ // other threads in the main thread's warp to reach the first
+ // synchronize::threads call in genericStateMachine before the main thread
+ // reaches its corresponding synchronize::threads call: that would permit all
+ // active worker threads to proceed before the main thread has actually set
+ // state::ParallelRegionFn, and then they would immediately quit without
+ // doing any work. mapping::getBlockSize() does not include any of the main
+ // thread's warp, so none of its threads can ever be active worker threads.
+ if (UseGenericStateMachine &&
+ mapping::getThreadIdInBlock() < mapping::getBlockSize())
genericStateMachine(Ident);
return mapping::getThreadIdInBlock();
diff --git a/libomptarget/deviceRTLs/common/src/omptarget.cu b/libomptarget/deviceRTLs/common/src/omptarget.cu
index d0be541..8862026 100644
--- a/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -225,7 +225,22 @@
if (TId == GetMasterThreadID())
return -1;
- if (UseGenericStateMachine)
+ // Enter the generic state machine if enabled and if this thread can possibly
+ // be an active worker thread.
+ //
+ // The latter check is important for NVIDIA Pascal (but not Volta) and AMD
+ // GPU. In those cases, a single thread can apparently satisfy a barrier on
+ // behalf of all threads in the same warp. Thus, it would not be safe for
+ // other threads in the main thread's warp to reach the first
+ // __kmpc_barrier_simple_spmd call in __kmpc_target_region_state_machine
+ // before the main thread reaches its corresponding
+ // __kmpc_barrier_simple_spmd call: that would permit all active worker
+ // threads to proceed before the main thread has actually set
+ // omptarget_nvptx_workFn, and then they would immediately quit without
+ // doing any work. GetNumberOfWorkersInTeam() does not include any of the
+ // main thread's warp, so none of its threads can ever be active worker
+ // threads.
+ if (UseGenericStateMachine && TId < GetNumberOfWorkersInTeam())
__kmpc_target_region_state_machine(Ident);
return TId;
diff --git a/libomptarget/deviceRTLs/common/src/support.cu b/libomptarget/deviceRTLs/common/src/support.cu
index b3bf550..47969a3 100644
--- a/libomptarget/deviceRTLs/common/src/support.cu
+++ b/libomptarget/deviceRTLs/common/src/support.cu
@@ -231,6 +231,7 @@
__attribute__((used, weak, optnone)) void keepAlive() {
__kmpc_get_hardware_thread_id_in_block();
__kmpc_get_hardware_num_threads_in_block();
+ __kmpc_get_warp_size();
__kmpc_barrier_simple_spmd(nullptr, 0);
__kmpc_barrier_simple_generic(nullptr, 0);
}
diff --git a/libomptarget/deviceRTLs/target_interface.h b/libomptarget/deviceRTLs/target_interface.h
index 8a1a2e4..94c92eb 100644
--- a/libomptarget/deviceRTLs/target_interface.h
+++ b/libomptarget/deviceRTLs/target_interface.h
@@ -23,6 +23,7 @@
EXTERN int GetBlockIdInKernel();
EXTERN NOINLINE int __kmpc_get_hardware_num_blocks();
EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block();
+EXTERN unsigned __kmpc_get_warp_size();
EXTERN unsigned GetWarpId();
EXTERN unsigned GetWarpSize();
EXTERN unsigned GetLaneId();
diff --git a/libomptarget/test/offloading/bug51781.c b/libomptarget/test/offloading/bug51781.c
new file mode 100644
index 0000000..14036db
--- /dev/null
+++ b/libomptarget/test/offloading/bug51781.c
@@ -0,0 +1,38 @@
+// Use the generic state machine. On some architectures, other threads in the
+// main thread's warp must avoid barrier instructions.
+//
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// SPMDize. There is no main thread, so there's no issue.
+//
+// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt > %t.spmd 2>&1
+// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=SPMD -input-file=%t.spmd
+// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=SPMD -input-file=%t.spmd
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
+//
+// SPMD: Transformed generic-mode kernel to SPMD-mode.
+
+// Use the custom state machine, which must avoid the same barrier problem as
+// the generic state machine.
+//
+// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt \
+// RUN: -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1
+// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom
+// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
+//
+// CUSTOM: Rewriting generic-mode kernel with a customized state machine.
+
+#include <stdio.h>
+int main() {
+ int x = 0, y = 1;
+ #pragma omp target teams num_teams(1) map(tofrom:x, y)
+ {
+ x = 5;
+ #pragma omp parallel
+ y = 6;
+ }
+ // CHECK: 5, 6
+ printf("%d, %d\n", x, y);
+ return 0;
+}