Revert "[OpenMP] Remove noinline attributes in the device runtime"

The behaviour of this patch is not great, but it has some side-effects
that are required for OpenMPOpt to work. The problem is that when we use
`-mlink-builtin-bitcode` we only import used symbols from the runtime.
Then OpenMPOpt will insert calls to symbols that were not previously
included. The reverted commit removed this implicit behaviour, as these
functions were kept alive by the `noinline` attribute simply because it
kept calls to them in the module. This caused regressions in some tests
that relied on some OpenMPOpt passes without using LTO. Reverting for
the LLVM 15 release, but we will try to fix it more correctly on main.

This reverts commit d61d72dae604c3258e25c00622b1a85861450303.

Fixes #56752

(cherry picked from commit b08369f7f288b6efb0897953da42ed54e60cfc0b)
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index ef2384f..0b42fc1 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -499,6 +499,18 @@
   }
 #include "llvm/Frontend/OpenMP/OMPKinds.def"
 
+    // Remove the `noinline` attribute from `__kmpc`, `_OMP::` and `omp_`
+    // functions, except if `optnone` is present.
+    if (isOpenMPDevice(M)) {
+      for (Function &F : M) {
+        for (StringRef Prefix : {"__kmpc", "_ZN4_OMP", "omp_"})
+          if (F.hasFnAttribute(Attribute::NoInline) &&
+              F.getName().startswith(Prefix) &&
+              !F.hasFnAttribute(Attribute::OptimizeNone))
+            F.removeFnAttr(Attribute::NoInline);
+      }
+    }
+
     // TODO: We should attach the attributes defined in OMPKinds.def.
   }
 
diff --git a/llvm/test/Transforms/OpenMP/remove_noinline_attributes.ll b/llvm/test/Transforms/OpenMP/remove_noinline_attributes.ll
new file mode 100644
index 0000000..349e279
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/remove_noinline_attributes.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes
+; RUN: opt < %s -S -openmp-opt-cgscc        | FileCheck %s
+; RUN: opt < %s -S -passes=openmp-opt-cgscc | FileCheck %s
+
+declare void @unknown()
+
+; __kmpc functions
+define void @__kmpc_noinline() noinline nounwind {
+; CHECK: Function Attrs: nounwind
+; CHECK-LABEL: @__kmpc_noinline(
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown()
+  ret void
+}
+; omp_X functions
+define void @omp_noinline() noinline nounwind {
+; CHECK: Function Attrs: nounwind
+; CHECK-LABEL: @omp_noinline(
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown()
+  ret void
+}
+; _OMP namespace
+define void @_ZN4_OMP_noinline() noinline nounwind {
+; CHECK: Function Attrs: nounwind
+; CHECK-LABEL: @_ZN4_OMP_noinline(
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown()
+  ret void
+}
+
+; Negative tests:
+
+define void @__kmpc_noinline_optnone() noinline optnone nounwind {
+; CHECK: Function Attrs: noinline nounwind optnone
+; CHECK-LABEL: @__kmpc_noinline_optnone(
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown()
+  ret void
+}
+define void @omp_noinline_optnone() noinline optnone nounwind {
+; CHECK: Function Attrs: noinline nounwind optnone
+; CHECK-LABEL: @omp_noinline_optnone(
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown()
+  ret void
+}
+; _OMP namespace
+define void @_ZN4_OMP_noinline_optnone() noinline optnone nounwind {
+; CHECK: Function Attrs: noinline nounwind optnone
+; CHECK-LABEL: @_ZN4_OMP_noinline_optnone(
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown()
+  ret void
+}
+define void @a___kmpc_noinline() noinline nounwind {
+; CHECK: Function Attrs: noinline nounwind
+; CHECK-LABEL: @a___kmpc_noinline(
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown()
+  ret void
+}
+define void @a_omp_noinline() noinline nounwind {
+; CHECK: Function Attrs: noinline nounwind
+; CHECK-LABEL: @a_omp_noinline(
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown()
+  ret void
+}
+define void @a__ZN4_OMP_noinline() noinline nounwind {
+; CHECK: Function Attrs: noinline nounwind
+; CHECK-LABEL: @a__ZN4_OMP_noinline(
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown()
+  ret void
+}
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 7, !"openmp", i32 50}
+!1 = !{i32 7, !"openmp-device", i32 50}
diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index 4b8898f..e33f37a 100644
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -29,13 +29,15 @@
 
 /// Synchronizing threads is allowed even if they all hit different instances of
 /// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
-/// restrictive in that it requires all threads to hit the same instance.
+/// restrictive in that it requires all threads to hit the same instance. The
+/// noinline is removed by the openmp-opt pass and helps to preserve the
+/// information till then.
 ///{
 #pragma omp begin assumes ext_aligned_barrier
 
 /// Synchronize all threads in a block, they are are reaching the same
 /// instruction (hence all threads in the block are "aligned").
-void threadsAligned();
+__attribute__((noinline)) void threadsAligned();
 
 #pragma omp end assumes
 ///}
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index b161c55..172bbbf 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -289,17 +289,17 @@
 ///}
 
 extern "C" {
-uint32_t __kmpc_get_hardware_thread_id_in_block() {
+__attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() {
   FunctionTracingRAII();
   return mapping::getThreadIdInBlock();
 }
 
-uint32_t __kmpc_get_hardware_num_threads_in_block() {
+__attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() {
   FunctionTracingRAII();
   return impl::getNumHardwareThreadsInBlock();
 }
 
-uint32_t __kmpc_get_warp_size() {
+__attribute__((noinline)) uint32_t __kmpc_get_warp_size() {
   FunctionTracingRAII();
   return impl::getWarpSize();
 }
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index 5b133b0..27d1ff2 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -243,7 +243,8 @@
     __kmpc_end_sharing_variables();
 }
 
-bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
+__attribute__((noinline)) bool
+__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
   FunctionTracingRAII();
   // Work function and arguments for L1 parallel region.
   *WorkFn = state::ParallelRegionFn;
@@ -258,7 +259,7 @@
   return ThreadIsActive;
 }
 
-void __kmpc_kernel_end_parallel() {
+__attribute__((noinline)) void __kmpc_kernel_end_parallel() {
   FunctionTracingRAII();
   // In case we have modified an ICV for this thread before a ThreadState was
   // created. We drop it now to not contaminate the next parallel region.
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index 92847f7..7a73330 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -393,12 +393,12 @@
 }
 
 extern "C" {
-void *__kmpc_alloc_shared(uint64_t Bytes) {
+__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
   FunctionTracingRAII();
   return memory::allocShared(Bytes, "Frontend alloc shared");
 }
 
-void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
+__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
   FunctionTracingRAII();
   memory::freeShared(Ptr, Bytes, "Frontend free shared");
 }
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 350da0b..4327871 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -358,12 +358,14 @@
   impl::namedBarrier();
 }
 
-void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
+__attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
+                                                          int32_t TId) {
   FunctionTracingRAII();
   synchronize::threadsAligned();
 }
 
-void __kmpc_barrier_simple_generic(IdentTy *Loc, int32_t TId) {
+__attribute__((noinline)) void __kmpc_barrier_simple_generic(IdentTy *Loc,
+                                                             int32_t TId) {
   FunctionTracingRAII();
   synchronize::threads();
 }