[Libomptarget] Replace global `PluginTy::get` interface with references (#86595)

Summary:
We have a plugin singleton that implements the Plugin interface. This
then spawns separate device and kernels. Previously when these needed to
reach into the global singleton they would use the `PluginTy::get`
routine to get access to it. In the future we will move away from this
as the lifetime of the plugin will be handled by `libomptarget`
directly. This patch removes uses of this inside of the plugin
implementaion themselves by simply keeping a reference to the plugin
inside of the device.

The external `__tgt_rtl` functions still use the global method, but will
be removed later.

GitOrigin-RevId: 4dc322524871df02578a05b260dacf28ff130c02
diff --git a/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 14461f1..2dd08dd 100644
--- a/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -371,7 +371,8 @@
 struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
 
   /// Create an empty memory manager.
-  AMDGPUMemoryManagerTy() : MemoryPool(nullptr), MemoryManager(nullptr) {}
+  AMDGPUMemoryManagerTy(AMDGPUPluginTy &Plugin)
+      : Plugin(Plugin), MemoryPool(nullptr), MemoryManager(nullptr) {}
 
   /// Initialize the memory manager from a memory pool.
   Error init(AMDGPUMemoryPoolTy &MemoryPool) {
@@ -429,6 +430,9 @@
     return OFFLOAD_SUCCESS;
   }
 
+  /// The underlying plugin that owns this memory manager.
+  AMDGPUPluginTy &Plugin;
+
   /// The memory pool used to allocate memory.
   AMDGPUMemoryPoolTy *MemoryPool;
 
@@ -1744,9 +1748,10 @@
 /// HSA host agent. We aggregate all its resources into the same instance.
 struct AMDHostDeviceTy : public AMDGenericDeviceTy {
   /// Create a host device from an array of host agents.
-  AMDHostDeviceTy(const llvm::SmallVector<hsa_agent_t> &HostAgents)
-      : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(),
-        PinnedMemoryManager() {
+  AMDHostDeviceTy(AMDGPUPluginTy &Plugin,
+                  const llvm::SmallVector<hsa_agent_t> &HostAgents)
+      : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(Plugin),
+        PinnedMemoryManager(Plugin) {
     assert(HostAgents.size() && "No host agent found");
   }
 
@@ -1840,9 +1845,10 @@
 /// generic device class.
 struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   // Create an AMDGPU device with a device id and default AMDGPU grid values.
-  AMDGPUDeviceTy(int32_t DeviceId, int32_t NumDevices,
+  AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
                  AMDHostDeviceTy &HostDevice, hsa_agent_t Agent)
-      : GenericDeviceTy(DeviceId, NumDevices, {0}), AMDGenericDeviceTy(),
+      : GenericDeviceTy(Plugin, DeviceId, NumDevices, {0}),
+        AMDGenericDeviceTy(),
         OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4),
         OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512),
         OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4),
@@ -2088,7 +2094,7 @@
   /// Allocate and construct an AMDGPU kernel.
   Expected<GenericKernelTy &> constructKernel(const char *Name) override {
     // Allocate and construct the AMDGPU kernel.
-    AMDGPUKernelTy *AMDGPUKernel = PluginTy::get().allocate<AMDGPUKernelTy>();
+    AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>();
     if (!AMDGPUKernel)
       return Plugin::error("Failed to allocate memory for AMDGPU kernel");
 
@@ -2138,8 +2144,7 @@
   Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
                                            int32_t ImageId) override {
     // Allocate and initialize the image object.
-    AMDGPUDeviceImageTy *AMDImage =
-        PluginTy::get().allocate<AMDGPUDeviceImageTy>();
+    AMDGPUDeviceImageTy *AMDImage = Plugin.allocate<AMDGPUDeviceImageTy>();
     new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
 
     // Load the HSA executable.
@@ -2697,7 +2702,7 @@
   }
   Error setDeviceHeapSize(uint64_t Value) override {
     for (DeviceImageTy *Image : LoadedImages)
-      if (auto Err = setupDeviceMemoryPool(PluginTy::get(), *Image, Value))
+      if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
         return Err;
     DeviceMemoryPoolSize = Value;
     return Plugin::success();
@@ -2737,7 +2742,7 @@
     return utils::iterateAgentMemoryPools(
         Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) {
           AMDGPUMemoryPoolTy *MemoryPool =
-              PluginTy::get().allocate<AMDGPUMemoryPoolTy>();
+              Plugin.allocate<AMDGPUMemoryPoolTy>();
           new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool);
           AllMemoryPools.push_back(MemoryPool);
           return HSA_STATUS_SUCCESS;
@@ -3090,7 +3095,7 @@
 
     // Initialize the host device using host agents.
     HostDevice = allocate<AMDHostDeviceTy>();
-    new (HostDevice) AMDHostDeviceTy(HostAgents);
+    new (HostDevice) AMDHostDeviceTy(*this, HostAgents);
 
     // Setup the memory pools of available for the host.
     if (auto Err = HostDevice->init())
@@ -3116,8 +3121,9 @@
   }
 
   /// Creates an AMDGPU device.
-  GenericDeviceTy *createDevice(int32_t DeviceId, int32_t NumDevices) override {
-    return new AMDGPUDeviceTy(DeviceId, NumDevices, getHostDevice(),
+  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
+                                int32_t NumDevices) override {
+    return new AMDGPUDeviceTy(Plugin, DeviceId, NumDevices, getHostDevice(),
                               getKernelAgent(DeviceId));
   }
 
@@ -3248,7 +3254,9 @@
   // 56 bytes per allocation.
   uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize;
 
-  AMDHostDeviceTy &HostDevice = PluginTy::get<AMDGPUPluginTy>().getHostDevice();
+  AMDGPUPluginTy &AMDGPUPlugin =
+      static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
+  AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice();
   AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager();
 
   void *AllArgs = nullptr;
@@ -3385,7 +3393,7 @@
   }
   assert(Ptr && "Invalid pointer");
 
-  auto &KernelAgents = PluginTy::get<AMDGPUPluginTy>().getKernelAgents();
+  auto &KernelAgents = Plugin.getKernelAgents();
 
   // Allow all kernel agents to access the allocation.
   if (auto Err = MemoryPool->enableAccess(Ptr, Size, KernelAgents)) {
@@ -3428,7 +3436,8 @@
   }
 
   if (Alloc) {
-    auto &KernelAgents = PluginTy::get<AMDGPUPluginTy>().getKernelAgents();
+    auto &KernelAgents =
+        static_cast<AMDGPUPluginTy &>(Plugin).getKernelAgents();
     // Inherently necessary for host or shared allocations
     // Also enabled for device memory to allow device to device memcpy
 
diff --git a/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index 1440f8f..b16b081 100644
--- a/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -610,7 +610,7 @@
 struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Construct a device with its device id within the plugin, the number of
   /// devices in the plugin and the grid values for that kind of device.
-  GenericDeviceTy(int32_t DeviceId, int32_t NumDevices,
+  GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices,
                   const llvm::omp::GV &GridValues);
 
   /// Get the device identifier within the corresponding plugin. Notice that
@@ -860,6 +860,9 @@
   /// Allocate and construct a kernel object.
   virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
 
+  /// Reference to the underlying plugin that created this device.
+  GenericPluginTy &Plugin;
+
 private:
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
@@ -977,7 +980,8 @@
   virtual Error deinitImpl() = 0;
 
   /// Create a new device for the underlying plugin.
-  virtual GenericDeviceTy *createDevice(int32_t DeviceID,
+  virtual GenericDeviceTy *createDevice(GenericPluginTy &Plugin,
+                                        int32_t DeviceID,
                                         int32_t NumDevices) = 0;
 
   /// Create a new global handler for the underlying plugin.
diff --git a/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 4db787a..c32bd08 100644
--- a/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -438,7 +438,7 @@
   // Retrieve kernel environment object for the kernel.
   GlobalTy KernelEnv(std::string(Name) + "_kernel_environment",
                      sizeof(KernelEnvironment), &KernelEnvironment);
-  GenericGlobalHandlerTy &GHandler = PluginTy::get().getGlobalHandler();
+  GenericGlobalHandlerTy &GHandler = GenericDevice.Plugin.getGlobalHandler();
   if (auto Err =
           GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) {
     [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
@@ -710,9 +710,10 @@
   return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
 }
 
-GenericDeviceTy::GenericDeviceTy(int32_t DeviceId, int32_t NumDevices,
+GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
+                                 int32_t NumDevices,
                                  const llvm::omp::GV &OMPGridValues)
-    : MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT"),
+    : Plugin(Plugin), MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT"),
       OMP_NumTeams("OMP_NUM_TEAMS"),
       OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
       OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
@@ -1522,7 +1523,7 @@
   assert(!Devices[DeviceId] && "Device already initialized");
 
   // Create the device and save the reference.
-  GenericDeviceTy *Device = createDevice(DeviceId, NumDevices);
+  GenericDeviceTy *Device = createDevice(*this, DeviceId, NumDevices);
   assert(Device && "Invalid device");
 
   // Save the device reference into the list.
diff --git a/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index d43c723..fc74c6a 100644
--- a/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -255,8 +255,8 @@
 /// generic device class.
 struct CUDADeviceTy : public GenericDeviceTy {
   // Create a CUDA device with a device id and the default CUDA grid values.
-  CUDADeviceTy(int32_t DeviceId, int32_t NumDevices)
-      : GenericDeviceTy(DeviceId, NumDevices, NVPTXGridValues),
+  CUDADeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices)
+      : GenericDeviceTy(Plugin, DeviceId, NumDevices, NVPTXGridValues),
         CUDAStreamManager(*this), CUDAEventManager(*this) {}
 
   ~CUDADeviceTy() {}
@@ -471,7 +471,7 @@
   /// Allocate and construct a CUDA kernel.
   Expected<GenericKernelTy &> constructKernel(const char *Name) override {
     // Allocate and construct the CUDA kernel.
-    CUDAKernelTy *CUDAKernel = PluginTy::get().allocate<CUDAKernelTy>();
+    CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
     if (!CUDAKernel)
       return Plugin::error("Failed to allocate memory for CUDA kernel");
 
@@ -529,8 +529,7 @@
       return std::move(Err);
 
     // Allocate and initialize the image object.
-    CUDADeviceImageTy *CUDAImage =
-        PluginTy::get().allocate<CUDADeviceImageTy>();
+    CUDADeviceImageTy *CUDAImage = Plugin.allocate<CUDADeviceImageTy>();
     new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage);
 
     // Load the CUDA module.
@@ -1373,8 +1372,9 @@
   Error deinitImpl() override { return Plugin::success(); }
 
   /// Creates a CUDA device to use for offloading.
-  GenericDeviceTy *createDevice(int32_t DeviceId, int32_t NumDevices) override {
-    return new CUDADeviceTy(DeviceId, NumDevices);
+  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
+                                int32_t NumDevices) override {
+    return new CUDADeviceTy(Plugin, DeviceId, NumDevices);
   }
 
   /// Creates a CUDA global handler.
diff --git a/libomptarget/plugins-nextgen/host/src/rtl.cpp b/libomptarget/plugins-nextgen/host/src/rtl.cpp
index ea8ed8d..f0ce242 100644
--- a/libomptarget/plugins-nextgen/host/src/rtl.cpp
+++ b/libomptarget/plugins-nextgen/host/src/rtl.cpp
@@ -66,7 +66,7 @@
     GlobalTy Global(getName(), 0);
 
     // Get the metadata (address) of the kernel function.
-    GenericGlobalHandlerTy &GHandler = PluginTy::get().getGlobalHandler();
+    GenericGlobalHandlerTy &GHandler = Device.Plugin.getGlobalHandler();
     if (auto Err = GHandler.getGlobalMetadataFromDevice(Device, Image, Global))
       return Err;
 
@@ -132,8 +132,9 @@
 /// Class implementing the device functionalities for GenELF64.
 struct GenELF64DeviceTy : public GenericDeviceTy {
   /// Create the device with a specific id.
-  GenELF64DeviceTy(int32_t DeviceId, int32_t NumDevices)
-      : GenericDeviceTy(DeviceId, NumDevices, GenELF64GridValues) {}
+  GenELF64DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
+                   int32_t NumDevices)
+      : GenericDeviceTy(Plugin, DeviceId, NumDevices, GenELF64GridValues) {}
 
   ~GenELF64DeviceTy() {}
 
@@ -149,8 +150,7 @@
   /// Construct the kernel for a specific image on the device.
   Expected<GenericKernelTy &> constructKernel(const char *Name) override {
     // Allocate and construct the kernel.
-    GenELF64KernelTy *GenELF64Kernel =
-        PluginTy::get().allocate<GenELF64KernelTy>();
+    GenELF64KernelTy *GenELF64Kernel = Plugin.allocate<GenELF64KernelTy>();
     if (!GenELF64Kernel)
       return Plugin::error("Failed to allocate memory for GenELF64 kernel");
 
@@ -166,8 +166,7 @@
   Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
                                            int32_t ImageId) override {
     // Allocate and initialize the image object.
-    GenELF64DeviceImageTy *Image =
-        PluginTy::get().allocate<GenELF64DeviceImageTy>();
+    GenELF64DeviceImageTy *Image = Plugin.allocate<GenELF64DeviceImageTy>();
     new (Image) GenELF64DeviceImageTy(ImageId, *this, TgtImage);
 
     // Create a temporary file.
@@ -400,8 +399,9 @@
   Error deinitImpl() override { return Plugin::success(); }
 
   /// Creates a generic ELF device.
-  GenericDeviceTy *createDevice(int32_t DeviceId, int32_t NumDevices) override {
-    return new GenELF64DeviceTy(DeviceId, NumDevices);
+  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
+                                int32_t NumDevices) override {
+    return new GenELF64DeviceTy(Plugin, DeviceId, NumDevices);
   }
 
   /// Creates a generic global handler.