[OpenMP] Introduce aligned synchronization into the new device RT

We will later use the fact that a barrier is aligned to reason about
thread divergence. For now we introduce the assumption and some more
documentation.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D112153

GitOrigin-RevId: b16aadf0a79a2796c9ac68d1bfc8853ccf25c459
diff --git a/libomptarget/DeviceRTL/include/Synchronization.h b/libomptarget/DeviceRTL/include/Synchronization.h
index 7097056..e33f37a 100644
--- a/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/libomptarget/DeviceRTL/include/Synchronization.h
@@ -27,6 +27,21 @@
 /// Synchronize all threads in a block.
 void threads();
 
+/// Synchronizing threads is allowed even if they all hit different instances of
+/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
+/// restrictive in that it requires all threads to hit the same instance. The
+/// noinline is removed by the openmp-opt pass and helps to preserve the
+/// information till then.
+///{
+#pragma omp begin assumes ext_aligned_barrier
+
+/// Synchronize all threads in a block; they are reaching the same
+/// instruction (hence all threads in the block are "aligned").
+__attribute__((noinline)) void threadsAligned();
+
+#pragma omp end assumes
+///}
+
 } // namespace synchronize
 
 namespace fence {
diff --git a/libomptarget/DeviceRTL/src/Kernel.cpp b/libomptarget/DeviceRTL/src/Kernel.cpp
index f834754..d47fa03 100644
--- a/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -69,7 +69,7 @@
   const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
   if (IsSPMD) {
     inititializeRuntime(/* IsSPMD */ true);
-    synchronize::threads();
+    synchronize::threadsAligned();
   } else {
     inititializeRuntime(/* IsSPMD */ false);
     // No need to wait since only the main threads will execute user
diff --git a/libomptarget/DeviceRTL/src/Parallelism.cpp b/libomptarget/DeviceRTL/src/Parallelism.cpp
index 91309b8..81af773 100644
--- a/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -93,18 +93,36 @@
 
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
   if (mapping::isSPMDMode()) {
-    synchronize::threads();
+    // Avoid the race between the read of the `icv::Level` above and the write
+    // below by synchronizing all threads here.
+    synchronize::threadsAligned();
     {
+      // Note that the order here is important. `icv::Level` has to be updated
+      // last or the other updates will cause a thread specific state to be
+      // created.
       state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                             1u, TId == 0);
       state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0);
       state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0);
-      synchronize::threads();
+
+      // Synchronize all threads after the main thread (TId == 0) has set up the
+      // team state properly.
+      synchronize::threadsAligned();
+
+      ASSERT(state::ParallelTeamSize == NumThreads);
+      ASSERT(icv::ActiveLevel == 1u);
+      ASSERT(icv::Level == 1u);
 
       if (TId < NumThreads)
         invokeMicrotask(TId, 0, fn, args, nargs);
-      synchronize::threads();
+
+      // Synchronize all threads at the end of a parallel region.
+      synchronize::threadsAligned();
     }
+
+    ASSERT(state::ParallelTeamSize == 1u);
+    ASSERT(icv::ActiveLevel == 0u);
+    ASSERT(icv::Level == 0u);
     return;
   }
 
@@ -130,6 +148,9 @@
   }
 
   {
+    // Note that the order here is important. `icv::Level` has to be updated
+    // last or the other updates will cause a thread specific state to be
+    // created.
     state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                           1u, true);
     state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
diff --git a/libomptarget/DeviceRTL/src/State.cpp b/libomptarget/DeviceRTL/src/State.cpp
index ae6a93e..f39b61c 100644
--- a/libomptarget/DeviceRTL/src/State.cpp
+++ b/libomptarget/DeviceRTL/src/State.cpp
@@ -41,8 +41,8 @@
 ///{
 
 extern "C" {
-void *malloc(uint64_t Size);
-void free(void *Ptr);
+__attribute__((leaf)) void *malloc(uint64_t Size);
+__attribute__((leaf)) void free(void *Ptr);
 }
 
 ///}
diff --git a/libomptarget/DeviceRTL/src/Synchronization.cpp b/libomptarget/DeviceRTL/src/Synchronization.cpp
index 17a91de..c77e766 100644
--- a/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -132,6 +132,8 @@
 
 void syncThreads() { __builtin_amdgcn_s_barrier(); }
 
+void syncThreadsAligned() { syncThreads(); }
+
 void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
 
 void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
@@ -179,6 +181,8 @@
   asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
 }
 
+void syncThreadsAligned() { __syncthreads(); }
+
 constexpr uint32_t OMP_SPIN = 1000;
 constexpr uint32_t UNSET = 0;
 constexpr uint32_t SET = 1;
@@ -227,6 +231,8 @@
 
 void synchronize::threads() { impl::syncThreads(); }
 
+void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
+
 void fence::team(int Ordering) { impl::fenceTeam(Ordering); }
 
 void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
@@ -238,7 +244,7 @@
 }
 
 void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
-   impl::atomicStore(Addr, V, Ordering);
+  impl::atomicStore(Addr, V, Ordering);
 }
 
 uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
@@ -275,7 +281,7 @@
 
 __attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
                                                           int32_t TId) {
-  synchronize::threads();
+  synchronize::threadsAligned();
 }
 
 int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {