[Libomptarget] Remove global ctor and use reference counting (#80499)

Summary:
Currently we rely on global constructors to initialize and shut down the
OpenMP runtime library and plugin manager. This causes some issues
because we do not have a defined lifetime that we can rely on to release
and allocate resources. This patch instead adds some simple reference
counted initialization and deinitialization function.

A future patch will use the `deinit` interface to more intelligently
handle plugin deinitilization. Right now we do nothing and rely on
`atexit` inside of the plugins to tear them down. This isn't great
because it limits our ability to control these things.

Note that I made the `__tgt_register_lib` functions do the
initialization instead of adding calls to the new runtime functions in
the linker wrapper. The reason for this is because in the past it's been
easier to not introduce a new function call, since sometimes the user's
compiler will link against an older `libomptarget`. Maybe if we change
the name with offloading in the future we can simplify this.

Depends on https://github.com/llvm/llvm-project/pull/80460

GitOrigin-RevId: ea174c09342275d6c6fec48fb846eaf28fae5b51
diff --git a/libomptarget/include/PluginManager.h b/libomptarget/include/PluginManager.h
index ec5d98d..5e5306a 100644
--- a/libomptarget/include/PluginManager.h
+++ b/libomptarget/include/PluginManager.h
@@ -206,6 +206,12 @@
   ProtectedObj<DeviceContainerTy> Devices;
 };
 
+/// Initialize the plugin manager and OpenMP runtime.
+void initRuntime();
+
+/// Deinitialize the plugin and delete it.
+void deinitRuntime();
+
 extern PluginManager *PM;
 
 #endif // OMPTARGET_PLUGIN_MANAGER_H
diff --git a/libomptarget/include/omptarget.h b/libomptarget/include/omptarget.h
index c4faa23..9a2bd13 100644
--- a/libomptarget/include/omptarget.h
+++ b/libomptarget/include/omptarget.h
@@ -312,6 +312,12 @@
 /// add the clauses of the requires directives in a given file
 void __tgt_register_requires(int64_t Flags);
 
+/// Initializes the runtime library.
+void __tgt_rtl_init();
+
+/// Deinitializes the runtime library.
+void __tgt_rtl_deinit();
+
 /// adds a target shared library to the target execution image
 void __tgt_register_lib(__tgt_bin_desc *Desc);
 
diff --git a/libomptarget/src/OffloadRTL.cpp b/libomptarget/src/OffloadRTL.cpp
index 86ef0d5..dd75b1b 100644
--- a/libomptarget/src/OffloadRTL.cpp
+++ b/libomptarget/src/OffloadRTL.cpp
@@ -20,25 +20,39 @@
 extern void llvm::omp::target::ompt::connectLibrary();
 #endif
 
-__attribute__((constructor(101))) void init() {
+static std::mutex PluginMtx;
+static uint32_t RefCount = 0;
+
+void initRuntime() {
+  std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx);
   Profiler::get();
   TIMESCOPE();
 
-  DP("Init offload library!\n");
+  if (PM == nullptr)
+    PM = new PluginManager();
 
-  PM = new PluginManager();
-
+  RefCount++;
+  if (RefCount == 1) {
+    DP("Init offload library!\n");
 #ifdef OMPT_SUPPORT
-  // Initialize OMPT first
-  llvm::omp::target::ompt::connectLibrary();
+    // Initialize OMPT first
+    llvm::omp::target::ompt::connectLibrary();
 #endif
 
-  PM->init();
-
-  PM->registerDelayedLibraries();
+    PM->init();
+    PM->registerDelayedLibraries();
+  }
 }
 
-__attribute__((destructor(101))) void deinit() {
-  DP("Deinit offload library!\n");
-  delete PM;
+void deinitRuntime() {
+  std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx);
+  assert(PM && "Runtime not initialized");
+
+  if (RefCount == 1) {
+    DP("Deinit offload library!\n");
+    delete PM;
+    PM = nullptr;
+  }
+
+  RefCount--;
 }
diff --git a/libomptarget/src/PluginManager.cpp b/libomptarget/src/PluginManager.cpp
index 34f1f49..09f9c64 100644
--- a/libomptarget/src/PluginManager.cpp
+++ b/libomptarget/src/PluginManager.cpp
@@ -21,7 +21,7 @@
 using namespace llvm;
 using namespace llvm::sys;
 
-PluginManager *PM;
+PluginManager *PM = nullptr;
 
 // List of all plugins that can support offloading.
 static const char *RTLNames[] = {ENABLED_OFFLOAD_PLUGINS};
diff --git a/libomptarget/src/exports b/libomptarget/src/exports
index af882a2..d5432a9 100644
--- a/libomptarget/src/exports
+++ b/libomptarget/src/exports
@@ -1,5 +1,7 @@
 VERS1.0 {
   global:
+    __tgt_rtl_init;
+    __tgt_rtl_deinit;
     __tgt_register_requires;
     __tgt_register_lib;
     __tgt_unregister_lib;
diff --git a/libomptarget/src/interface.cpp b/libomptarget/src/interface.cpp
index d2707f3..8b89bc3 100644
--- a/libomptarget/src/interface.cpp
+++ b/libomptarget/src/interface.cpp
@@ -38,9 +38,13 @@
           __PRETTY_FUNCTION__);
 }
 
+EXTERN void __tgt_rtl_init() { initRuntime(); }
+EXTERN void __tgt_rtl_deinit() { deinitRuntime(); }
+
 ////////////////////////////////////////////////////////////////////////////////
 /// adds a target shared library to the target execution image
 EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
+  initRuntime();
   if (PM->delayRegisterLib(Desc))
     return;
 
@@ -49,12 +53,17 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 /// Initialize all available devices without registering any image
-EXTERN void __tgt_init_all_rtls() { PM->initAllPlugins(); }
+EXTERN void __tgt_init_all_rtls() {
+  assert(PM && "Runtime not initialized");
+  PM->initAllPlugins();
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 /// unloads a target shared library
 EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
   PM->unregisterLib(Desc);
+
+  deinitRuntime();
 }
 
 template <typename TargetAsyncInfoTy>
@@ -64,6 +73,7 @@
            map_var_info_t *ArgNames, void **ArgMappers,
            TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg,
            const char *RegionName) {
+  assert(PM && "Runtime not initialized");
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
@@ -239,6 +249,7 @@
 static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                                int32_t ThreadLimit, void *HostPtr,
                                KernelArgsTy *KernelArgs) {
+  assert(PM && "Runtime not initialized");
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
@@ -345,6 +356,7 @@
                                         void *VAddr, bool IsRecord,
                                         bool SaveOutput,
                                         uint64_t &ReqPtrArgOffset) {
+  assert(PM && "Runtime not initialized");
   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
   auto DeviceOrErr = PM->getDevice(DeviceId);
   if (!DeviceOrErr)
@@ -380,7 +392,7 @@
                                       ptrdiff_t *TgtOffsets, int32_t NumArgs,
                                       int32_t NumTeams, int32_t ThreadLimit,
                                       uint64_t LoopTripCount) {
-
+  assert(PM && "Runtime not initialized");
   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
   if (checkDeviceAndCtors(DeviceId, Loc)) {
     DP("Not offloading to device %" PRId64 "\n", DeviceId);
@@ -431,6 +443,7 @@
 }
 
 EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) {
+  assert(PM && "Runtime not initialized");
   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
   InfoLevel.store(NewInfoLevel);
   for (auto &R : PM->pluginAdaptors()) {
@@ -440,6 +453,7 @@
 }
 
 EXTERN int __tgt_print_device_info(int64_t DeviceId) {
+  assert(PM && "Runtime not initialized");
   auto DeviceOrErr = PM->getDevice(DeviceId);
   if (!DeviceOrErr)
     FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
@@ -448,7 +462,9 @@
 }
 
 EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
+  assert(PM && "Runtime not initialized");
   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+
   if (!AsyncHandle || !*AsyncHandle) {
     FATAL_MESSAGE0(
         1, "Receive an invalid async handle from the current OpenMP task. Is "
diff --git a/libomptarget/test/offloading/runtime_init.c b/libomptarget/test/offloading/runtime_init.c
new file mode 100644
index 0000000..96fd50f
--- /dev/null
+++ b/libomptarget/test/offloading/runtime_init.c
@@ -0,0 +1,30 @@
+// RUN: %libomptarget-compile-generic
+// RUN:   env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 \
+// RUN: %fcheck-generic
+
+// REQUIRES: libomptarget-debug
+
+#include <omp.h>
+#include <stdio.h>
+
+extern void __tgt_rtl_init(void);
+extern void __tgt_rtl_deinit(void);
+
+// Sanity checks to make sure that this works and is thread safe.
+int main() {
+  // CHECK: Init offload library!
+  // CHECK: Deinit offload library!
+  __tgt_rtl_init();
+#pragma omp parallel num_threads(8)
+  {
+    __tgt_rtl_init();
+    __tgt_rtl_deinit();
+  }
+  __tgt_rtl_deinit();
+
+  __tgt_rtl_init();
+  __tgt_rtl_deinit();
+
+  // CHECK: PASS
+  printf("PASS\n");
+}