[OpenMP][FIX] Introduce and use a simple generic-mode barrier

Before aligned barriers were introduced, `__kmpc_barrier_simple_spmd`
could safely be used in the custom state machine. Now that SPMD barriers
are assumed to be aligned, we need to use a "generic" barrier in places
that are not aligned.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D112893

GitOrigin-RevId: 73720c8059cfcce12f0cc5b7e6ff2e4b635a9a61
diff --git a/libomptarget/DeviceRTL/include/Interface.h b/libomptarget/DeviceRTL/include/Interface.h
index da04e14..302e3eb 100644
--- a/libomptarget/DeviceRTL/include/Interface.h
+++ b/libomptarget/DeviceRTL/include/Interface.h
@@ -249,6 +249,8 @@
 
 void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId);
 
+void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId);
+
 int32_t __kmpc_master(IdentTy *Loc, int32_t TId);
 
 void __kmpc_end_master(IdentTy *Loc, int32_t TId);
diff --git a/libomptarget/DeviceRTL/src/Synchronization.cpp b/libomptarget/DeviceRTL/src/Synchronization.cpp
index e219c75..6b4bab0 100644
--- a/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -348,6 +348,12 @@
   synchronize::threadsAligned();
 }
 
+__attribute__((noinline)) void __kmpc_barrier_simple_generic(IdentTy *Loc,
+                                                             int32_t TId) {
+  FunctionTracingRAII();
+  synchronize::threads(); // not threadsAligned(): callers may be unaligned
+}
+
 int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
   FunctionTracingRAII();
   return omp_get_team_num() == 0;
diff --git a/libomptarget/DeviceRTL/src/Utils.cpp b/libomptarget/DeviceRTL/src/Utils.cpp
index 8fcb96b..df57497 100644
--- a/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/libomptarget/DeviceRTL/src/Utils.cpp
@@ -25,6 +25,7 @@
   __kmpc_get_hardware_thread_id_in_block();
   __kmpc_get_hardware_num_threads_in_block();
   __kmpc_barrier_simple_spmd(nullptr, 0);
+  __kmpc_barrier_simple_generic(nullptr, 0);
 }
 } // namespace _OMP
 
diff --git a/libomptarget/deviceRTLs/common/src/sync.cu b/libomptarget/deviceRTLs/common/src/sync.cu
index 8711cd2..823c9fc 100644
--- a/libomptarget/deviceRTLs/common/src/sync.cu
+++ b/libomptarget/deviceRTLs/common/src/sync.cu
@@ -78,6 +78,9 @@
   __kmpc_impl_syncthreads();
   PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
 }
+EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
+  __kmpc_barrier_simple_spmd(loc_ref, tid); // forwards to the SPMD barrier
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // KMP MASTER
diff --git a/libomptarget/deviceRTLs/interface.h b/libomptarget/deviceRTLs/interface.h
index cb193c9..00aa07c 100644
--- a/libomptarget/deviceRTLs/interface.h
+++ b/libomptarget/deviceRTLs/interface.h
@@ -380,6 +380,7 @@
 // sync barrier
 EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
 EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
+EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
 EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);
 
 // single