[OpenMP] Introduce aligned synchronization into the new device RT
We will later use the fact that a barrier is aligned to reason about
thread divergence. For now we introduce the assumption and some more
documentation.
Reviewed By: tianshilei1992
Differential Revision: https://reviews.llvm.org/D112153
GitOrigin-RevId: b16aadf0a79a2796c9ac68d1bfc8853ccf25c459
diff --git a/libomptarget/DeviceRTL/include/Synchronization.h b/libomptarget/DeviceRTL/include/Synchronization.h
index 7097056..e33f37a 100644
--- a/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/libomptarget/DeviceRTL/include/Synchronization.h
@@ -27,6 +27,21 @@
/// Synchronize all threads in a block.
void threads();
+/// Synchronizing threads is allowed even if they all hit different instances of
+/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
+/// restrictive in that it requires all threads to hit the same instance. The
+/// noinline is removed by the openmp-opt pass and helps to preserve the
+/// information until then.
+///{
+#pragma omp begin assumes ext_aligned_barrier
+
+/// Synchronize all threads in a block, they are reaching the same
+/// instruction (hence all threads in the block are "aligned").
+__attribute__((noinline)) void threadsAligned();
+
+#pragma omp end assumes
+///}
+
} // namespace synchronize
namespace fence {
diff --git a/libomptarget/DeviceRTL/src/Kernel.cpp b/libomptarget/DeviceRTL/src/Kernel.cpp
index f834754..d47fa03 100644
--- a/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -69,7 +69,7 @@
const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
if (IsSPMD) {
inititializeRuntime(/* IsSPMD */ true);
- synchronize::threads();
+ synchronize::threadsAligned();
} else {
inititializeRuntime(/* IsSPMD */ false);
// No need to wait since only the main threads will execute user
diff --git a/libomptarget/DeviceRTL/src/Parallelism.cpp b/libomptarget/DeviceRTL/src/Parallelism.cpp
index 91309b8..81af773 100644
--- a/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -93,18 +93,36 @@
uint32_t NumThreads = determineNumberOfThreads(num_threads);
if (mapping::isSPMDMode()) {
- synchronize::threads();
+ // Avoid the race between the read of the `icv::Level` above and the write
+ // below by synchronizing all threads here.
+ synchronize::threadsAligned();
{
+ // Note that the order here is important. `icv::Level` has to be updated
+ // last or the other updates will cause a thread specific state to be
+ // created.
state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
1u, TId == 0);
state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0);
state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0);
- synchronize::threads();
+
+ // Synchronize all threads after the main thread (TId == 0) set up the
+ // team state properly.
+ synchronize::threadsAligned();
+
+ ASSERT(state::ParallelTeamSize == NumThreads);
+ ASSERT(icv::ActiveLevel == 1u);
+ ASSERT(icv::Level == 1u);
if (TId < NumThreads)
invokeMicrotask(TId, 0, fn, args, nargs);
- synchronize::threads();
+
+ // Synchronize all threads at the end of a parallel region.
+ synchronize::threadsAligned();
}
+
+ ASSERT(state::ParallelTeamSize == 1u);
+ ASSERT(icv::ActiveLevel == 0u);
+ ASSERT(icv::Level == 0u);
return;
}
@@ -130,6 +148,9 @@
}
{
+ // Note that the order here is important. `icv::Level` has to be updated
+ // last or the other updates will cause a thread specific state to be
+ // created.
state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
1u, true);
state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
diff --git a/libomptarget/DeviceRTL/src/State.cpp b/libomptarget/DeviceRTL/src/State.cpp
index ae6a93e..f39b61c 100644
--- a/libomptarget/DeviceRTL/src/State.cpp
+++ b/libomptarget/DeviceRTL/src/State.cpp
@@ -41,8 +41,8 @@
///{
extern "C" {
-void *malloc(uint64_t Size);
-void free(void *Ptr);
+__attribute__((leaf)) void *malloc(uint64_t Size);
+__attribute__((leaf)) void free(void *Ptr);
}
///}
diff --git a/libomptarget/DeviceRTL/src/Synchronization.cpp b/libomptarget/DeviceRTL/src/Synchronization.cpp
index 17a91de..c77e766 100644
--- a/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -132,6 +132,8 @@
void syncThreads() { __builtin_amdgcn_s_barrier(); }
+void syncThreadsAligned() { syncThreads(); }
+
void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
@@ -179,6 +181,8 @@
asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
}
+void syncThreadsAligned() { __syncthreads(); }
+
constexpr uint32_t OMP_SPIN = 1000;
constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;
@@ -227,6 +231,8 @@
void synchronize::threads() { impl::syncThreads(); }
+void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
+
void fence::team(int Ordering) { impl::fenceTeam(Ordering); }
void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
@@ -238,7 +244,7 @@
}
void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
- impl::atomicStore(Addr, V, Ordering);
+ impl::atomicStore(Addr, V, Ordering);
}
uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
@@ -275,7 +281,7 @@
__attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
int32_t TId) {
- synchronize::threads();
+ synchronize::threadsAligned();
}
int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {