[Libomptarget] Deinitialize AMDGPU global state more intentionally

A previous patch made the destruction of the HSA plugin more
deterministic. However, there were still other global values that are not
handled this way. When attempting to call a destructor kernel, the
device would have already been uninitialized and we could not find the
appropriate kernel to call. This is because they were stored in global
containers that had their destructors called already. Merges this global
state into the rest of the info state by putting those global values
inside of the global pointer already allocated and deallocated by the
constructor and destructor. This should allow the AMDGPU plugin to
correctly identify the destructors if we were to run them.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D131011

GitOrigin-RevId: 2b7203a35972e98b8521f92d2791043dc539ae88
diff --git a/libomptarget/plugins/amdgpu/src/rtl.cpp b/libomptarget/plugins/amdgpu/src/rtl.cpp
index 3eb1d0f..6b95756 100644
--- a/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -210,9 +210,6 @@
 };
 pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
 
-std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
-    KernelArgPoolMap;
-
 /// Use a single entity to encode a kernel and a set of flags
 struct KernelTy {
   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
@@ -224,7 +221,9 @@
   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
            int32_t DeviceId, void *CallStackAddr, const char *Name,
            uint32_t KernargSegmentSize,
-           hsa_amd_memory_pool_t &KernArgMemoryPool)
+           hsa_amd_memory_pool_t &KernArgMemoryPool,
+           std::unordered_map<std::string, std::unique_ptr<KernelArgPool>>
+               &KernelArgPoolMap)
       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
         DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
@@ -238,10 +237,6 @@
   }
 };
 
-/// List that contains all the kernels.
-/// FIXME: we may need this to be per device and per library.
-std::list<KernelTy> KernelsList;
-
 template <typename Callback> static hsa_status_t findAgents(Callback CB) {
 
   hsa_status_t Err =
@@ -456,6 +451,12 @@
 
   int NumberOfDevices = 0;
 
+  /// List that contains all the kernels.
+  /// FIXME: we may need this to be per device and per library.
+  std::list<KernelTy> KernelsList;
+  std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
+      KernelArgPoolMap;
+
   // GPU devices
   std::vector<hsa_agent_t> HSAAgents;
   std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
@@ -857,7 +858,6 @@
            "Unexpected device id!");
     FuncGblEntries[DeviceId].emplace_back();
     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-    // KernelArgPoolMap.clear();
     E.Entries.clear();
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
@@ -1116,19 +1116,6 @@
 static RTLDeviceInfoTy *DeviceInfoState = nullptr;
 static RTLDeviceInfoTy &DeviceInfo() { return *DeviceInfoState; }
 
-int32_t __tgt_rtl_init_plugin() {
-  DeviceInfoState = new RTLDeviceInfoTy;
-  return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded)
-             ? OFFLOAD_SUCCESS
-             : OFFLOAD_FAIL;
-}
-
-int32_t __tgt_rtl_deinit_plugin() {
-  if (DeviceInfoState)
-    delete DeviceInfoState;
-  return OFFLOAD_SUCCESS;
-}
-
 namespace {
 
 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
@@ -1171,7 +1158,7 @@
      (long long unsigned)(Elf64_Addr)HstPtr,
      (long long unsigned)(Elf64_Addr)TgtPtr);
   Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
-                                           DeviceId);
+                                             DeviceId);
   if (Err != HSA_STATUS_SUCCESS) {
     DP("Error when copying data from host to device. Pointers: "
        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
@@ -1468,8 +1455,9 @@
     KernelArgPool *ArgPool = nullptr;
     void *KernArg = nullptr;
     {
-      auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
-      if (It != KernelArgPoolMap.end()) {
+      auto It =
+          DeviceInfo().KernelArgPoolMap.find(std::string(KernelInfo->Name));
+      if (It != DeviceInfo().KernelArgPoolMap.end()) {
         ArgPool = (It->second).get();
       }
     }
@@ -2031,6 +2019,20 @@
 }
 
 extern "C" {
+
+int32_t __tgt_rtl_init_plugin() {
+  DeviceInfoState = new RTLDeviceInfoTy;
+  return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded)
+             ? OFFLOAD_SUCCESS
+             : OFFLOAD_FAIL;
+}
+
+int32_t __tgt_rtl_deinit_plugin() {
+  if (DeviceInfoState)
+    delete DeviceInfoState;
+  return OFFLOAD_SUCCESS;
+}
+
 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
   return elfMachineIdIsAmdgcn(Image);
 }
@@ -2373,8 +2375,8 @@
         }
 
         // write ptr to device memory so it can be used by later kernels
-        Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
-                                                 DeviceId);
+        Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
+                                                   sizeof(void *), DeviceId);
         if (Err != HSA_STATUS_SUCCESS) {
           DP("memcpy install of state_ptr failed\n");
           return NULL;
@@ -2437,8 +2439,8 @@
         // If unified memory is present any target link variables
         // can access host addresses directly. There is no longer a
         // need for device copies.
-        Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
-                                                 sizeof(void *), DeviceId);
+        Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, sizeof(void *),
+                                                 DeviceId);
         if (Err != HSA_STATUS_SUCCESS)
           DP("Error when copying USM\n");
         DP("Copy linked variable host address (" DPxMOD ")"
@@ -2598,11 +2600,12 @@
     }
     check("Loading computation property", Err);
 
-    KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
-                                   CallStackAddr, E->name, KernargSegmentSize,
-                                   DeviceInfo().KernArgPool));
+    DeviceInfo().KernelsList.push_back(
+        KernelTy(ExecModeVal, WGSizeVal, DeviceId, CallStackAddr, E->name,
+                 KernargSegmentSize, DeviceInfo().KernArgPool,
+                 DeviceInfo().KernelArgPoolMap));
     __tgt_offload_entry Entry = *E;
-    Entry.addr = (void *)&KernelsList.back();
+    Entry.addr = (void *)&DeviceInfo().KernelsList.back();
     DeviceInfo().addOffloadEntry(DeviceId, Entry);
     DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
   }