blob: 705ae8a226f6782346c42a1c90a190c588d5d239 [file] [log] [blame]
//===- ol_impl.cpp - Implementation of the new LLVM/Offload API ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains the definitions of the new LLVM/Offload API entry points. See
// new-api/API/README.md for more information.
//
//===----------------------------------------------------------------------===//
#include "OffloadImpl.hpp"
#include "Helpers.hpp"
#include "PluginManager.h"
#include "llvm/Support/FormatVariadic.h"
#include <OffloadAPI.h>
#include <mutex>
// TODO: Some plugins expect to be linked into libomptarget which defines these
// symbols to implement ompt callbacks. The least invasive workaround here is to
// define them in libLLVMOffload as false/null so they are never used. In future
// it would be better to allow the plugins to implement callbacks without
// pulling in details from libomptarget.
#ifdef OMPT_SUPPORT
namespace llvm::omp::target {
namespace ompt {
// Stub OMPT state: with Initialized == false and null lookup functions the
// plugins treat OMPT as permanently disabled.
bool Initialized = false;
ompt_get_callback_t lookupCallbackByCode = nullptr;
ompt_function_lookup_t lookupCallbackByName = nullptr;
} // namespace ompt
} // namespace llvm::omp::target
#endif
using namespace llvm::omp::target;
using namespace llvm::omp::target::plugin;
// Handle type definitions. Ideally these would be 1:1 with the plugins, but
// we add some additional data here for now to avoid churn in the plugin
// interface.
/// Implementation of an Offload device handle: pairs a plugin device object
/// with its per-plugin index and owning platform.
struct ol_device_impl_t {
  ol_device_impl_t(int DeviceNum, GenericDeviceTy *Device,
                   ol_platform_handle_t Platform)
      : DeviceNum(DeviceNum), Device(Device), Platform(Platform) {}
  // Index of this device within its plugin; -1 for the pseudo host device.
  int DeviceNum;
  // Plugin-side device object; null for the pseudo host device.
  GenericDeviceTy *Device;
  // Owning platform. For the host device this is patched in after the host
  // platform is appended (see initPlugins).
  ol_platform_handle_t Platform;
};
/// Implementation of an Offload platform handle: owns one plugin instance and
/// the list of devices it exposes.
struct ol_platform_impl_t {
  /// Takes ownership of Plugin. Devices is a by-value sink parameter and is
  /// moved into place (the previous code copied the entire vector).
  ol_platform_impl_t(std::unique_ptr<GenericPluginTy> Plugin,
                     std::vector<ol_device_impl_t> Devices,
                     ol_platform_backend_t BackendType)
      : Plugin(std::move(Plugin)), Devices(std::move(Devices)),
        BackendType(BackendType) {}
  // Backing plugin; null for the special host platform (see initPlugins).
  std::unique_ptr<GenericPluginTy> Plugin;
  std::vector<ol_device_impl_t> Devices;
  ol_platform_backend_t BackendType;
};
/// Implementation of an Offload queue handle: a plugin async-info (stream)
/// bound to the device it was created on.
struct ol_queue_impl_t {
  ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device)
      : AsyncInfo(AsyncInfo), Device(Device) {}
  // Underlying stream resource; created/recreated via initAsyncInfo.
  // NOTE(review): there is no destructor, so this is not released on
  // olDestroyQueue — confirm ownership lies with the plugin.
  __tgt_async_info *AsyncInfo;
  ol_device_handle_t Device;
};
/// Implementation of an Offload event handle, recorded on a queue.
struct ol_event_impl_t {
  ol_event_impl_t(void *EventInfo, ol_queue_handle_t Queue)
      : EventInfo(EventInfo), Queue(Queue) {}
  // Release the plugin event; any error from destroyEvent is ignored.
  ~ol_event_impl_t() { (void)Queue->Device->Device->destroyEvent(EventInfo); }
  // Opaque plugin event object, filled in by createEvent (see makeEvent).
  void *EventInfo;
  ol_queue_handle_t Queue;
};
/// Implementation of an Offload program handle: a loaded device image plus an
/// owned copy of the program binary it was loaded from.
struct ol_program_impl_t {
  ol_program_impl_t(plugin::DeviceImageTy *Image,
                    std::unique_ptr<llvm::MemoryBuffer> ImageData,
                    const __tgt_device_image &DeviceImage)
      : Image(Image), ImageData(std::move(ImageData)),
        DeviceImage(DeviceImage) {}
  // Plugin-side image; set after loadBinary succeeds (see olCreateProgram).
  plugin::DeviceImageTy *Image;
  // Owned copy of the binary; DeviceImage's pointers reference this buffer.
  std::unique_ptr<llvm::MemoryBuffer> ImageData;
  __tgt_device_image DeviceImage;
};
namespace llvm {
namespace offload {
/// Bookkeeping for an allocation made through olMemAlloc: which device owns it
/// and what kind it is, both needed again when the pointer is freed.
struct AllocInfo {
  ol_device_handle_t Device;
  ol_alloc_type_t Type;
};
using AllocInfoMapT = DenseMap<void *, AllocInfo>;
/// Accessor for the process-wide allocation registry. The map is a
/// function-local static so it is constructed lazily on first use.
AllocInfoMapT &allocInfoMap() {
  static AllocInfoMapT Map;
  return Map;
}
using PlatformVecT = SmallVector<ol_platform_impl_t, 4>;
/// Accessor for the global platform list, populated once by initPlugins.
/// Function-local static gives lazy, ordered construction.
PlatformVecT &Platforms() {
  static PlatformVecT AllPlatforms;
  return AllPlatforms;
}
/// Fetch the pseudo host device. initPlugins appends the host platform after
/// every plugin platform, so it is always the final entry and carries exactly
/// one device.
ol_device_handle_t HostDevice() {
  auto &HostPlatform = Platforms().back();
  return &HostPlatform.Devices.front();
}
/// Generic destroy helper shared by all handle types: deleting the impl
/// object runs its destructor (which releases plugin resources where the impl
/// type defines one).
template <typename HandleT> ol_impl_result_t olDestroy(HandleT Handle) {
  delete Handle;
  return OL_SUCCESS;
}
/// Map a plugin's short name onto the public backend enumeration. Any plugin
/// not listed here (e.g. the host plugin) reports
/// OL_PLATFORM_BACKEND_UNKNOWN.
constexpr ol_platform_backend_t pluginNameToBackend(StringRef Name) {
  if (Name == "amdgpu")
    return OL_PLATFORM_BACKEND_AMDGPU;
  if (Name == "cuda")
    return OL_PLATFORM_BACKEND_CUDA;
  return OL_PLATFORM_BACKEND_UNKNOWN;
}
// Every plugin exports this method to create an instance of the plugin type.
#define PLUGIN_TARGET(Name) extern "C" GenericPluginTy *createPlugin_##Name();
#include "Shared/Targets.def"
/// One-time setup: instantiate every compiled-in plugin, initialize its
/// devices, append the pseudo host platform, and read the environment-driven
/// config. Invoked exactly once via olInit_impl's call_once.
void initPlugins() {
  // Attempt to create an instance of each supported plugin.
#define PLUGIN_TARGET(Name)                                                    \
  do {                                                                         \
    Platforms().emplace_back(ol_platform_impl_t{                               \
        std::unique_ptr<GenericPluginTy>(createPlugin_##Name()),               \
        {},                                                                    \
        pluginNameToBackend(#Name)});                                          \
  } while (false);
#include "Shared/Targets.def"

  // Preemptively initialize all devices in the plugin
  for (auto &Platform : Platforms()) {
    // Do not use the host plugin - it isn't supported.
    if (Platform.BackendType == OL_PLATFORM_BACKEND_UNKNOWN)
      continue;
    // Plugin init errors are deliberately swallowed here: a plugin that fails
    // to initialize simply exposes no devices below. toString consumes the
    // llvm::Error so it is not flagged as unchecked.
    auto Err = Platform.Plugin->init();
    [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
    // Only devices that individually initialize successfully are exposed.
    for (auto DevNum = 0; DevNum < Platform.Plugin->number_of_devices();
         DevNum++) {
      if (Platform.Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) {
        Platform.Devices.emplace_back(ol_device_impl_t{
            DevNum, &Platform.Plugin->getDevice(DevNum), &Platform});
      }
    }
  }

  // Add the special host device
  auto &HostPlatform = Platforms().emplace_back(
      ol_platform_impl_t{nullptr,
                         {ol_device_impl_t{-1, nullptr, nullptr}},
                         OL_PLATFORM_BACKEND_HOST});
  // The host device's platform pointer can only be set once the platform
  // element actually exists inside the vector.
  HostDevice()->Platform = &HostPlatform;

  // Tracing/validation are controlled purely by environment variables.
  offloadConfig().TracingEnabled = std::getenv("OFFLOAD_TRACE");
  offloadConfig().ValidationEnabled =
      !std::getenv("OFFLOAD_DISABLE_VALIDATION");
}
// TODO: We can properly reference count here and manage the resources in a more
// clever way
/// Thread-safe one-time initialization; repeated calls are no-ops.
ol_impl_result_t olInit_impl() {
  static std::once_flag InitFlag;
  std::call_once(InitFlag, []() { initPlugins(); });
  return OL_SUCCESS;
}
ol_impl_result_t olShutDown_impl() { return OL_SUCCESS; }
/// Shared worker for olGetPlatformInfo{,Size}: writes the requested property
/// into PropValue (when provided) and/or reports its size via PropSizeRet.
ol_impl_result_t olGetPlatformInfoImplDetail(ol_platform_handle_t Platform,
                                             ol_platform_info_t PropName,
                                             size_t PropSize, void *PropValue,
                                             size_t *PropSizeRet) {
  ReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet);
  // The host platform has no plugin object, so its name is special-cased.
  bool IsHost = Platform->BackendType == OL_PLATFORM_BACKEND_HOST;

  switch (PropName) {
  case OL_PLATFORM_INFO_NAME:
    return ReturnValue(IsHost ? "Host" : Platform->Plugin->getName());
  case OL_PLATFORM_INFO_VENDOR_NAME:
    // TODO: Implement this
    return ReturnValue("Unknown platform vendor");
  case OL_PLATFORM_INFO_VERSION: {
    // The formatted temporary lives until the end of this full expression, so
    // c_str() stays valid for the duration of the ReturnValue call (assumes
    // ReturnHelper copies the bytes out during the call — TODO confirm).
    return ReturnValue(formatv("v{0}.{1}.{2}", OL_VERSION_MAJOR,
                               OL_VERSION_MINOR, OL_VERSION_PATCH)
                           .str()
                           .c_str());
  }
  case OL_PLATFORM_INFO_BACKEND: {
    return ReturnValue(Platform->BackendType);
  }
  default:
    return OL_ERRC_INVALID_ENUMERATION;
  }

  // Unreachable: every case above returns.
  return OL_SUCCESS;
}
/// Value query: forwards to the shared worker without a size-out pointer.
ol_impl_result_t olGetPlatformInfo_impl(ol_platform_handle_t Platform,
                                        ol_platform_info_t PropName,
                                        size_t PropSize, void *PropValue) {
  return olGetPlatformInfoImplDetail(Platform, PropName, PropSize, PropValue,
                                     /*PropSizeRet=*/nullptr);
}
/// Size-only query: forwards to the shared worker with no output buffer.
ol_impl_result_t olGetPlatformInfoSize_impl(ol_platform_handle_t Platform,
                                            ol_platform_info_t PropName,
                                            size_t *PropSizeRet) {
  return olGetPlatformInfoImplDetail(Platform, PropName, /*PropSize=*/0,
                                     /*PropValue=*/nullptr, PropSizeRet);
}
/// Shared worker for olGetDeviceInfo{,Size}: writes the requested property
/// into PropValue (when provided) and/or reports its size via PropSizeRet.
ol_impl_result_t olGetDeviceInfoImplDetail(ol_device_handle_t Device,
                                           ol_device_info_t PropName,
                                           size_t PropSize, void *PropValue,
                                           size_t *PropSizeRet) {
  ReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet);

  // Find the info if it exists under any of the given names. The name list is
  // taken by const reference and iterated by reference — the previous version
  // copied the vector and every candidate string on each query.
  auto GetInfo = [&](const std::vector<std::string> &Names) {
    InfoQueueTy DevInfo;
    if (auto Err = Device->Device->obtainInfoImpl(DevInfo))
      return std::string("");

    for (const auto &Name : Names) {
      auto InfoKeyMatches = [&](const InfoQueueTy::InfoQueueEntryTy &Info) {
        return Info.Key == Name;
      };
      auto Item = std::find_if(DevInfo.getQueue().begin(),
                               DevInfo.getQueue().end(), InfoKeyMatches);
      if (Item != DevInfo.getQueue().end())
        return Item->Value;
    }

    // None of the candidate keys were present.
    return std::string("");
  };

  switch (PropName) {
  case OL_DEVICE_INFO_PLATFORM:
    return ReturnValue(Device->Platform);
  case OL_DEVICE_INFO_TYPE:
    return ReturnValue(OL_DEVICE_TYPE_GPU);
  case OL_DEVICE_INFO_NAME:
    return ReturnValue(GetInfo({"Device Name"}).c_str());
  case OL_DEVICE_INFO_VENDOR:
    return ReturnValue(GetInfo({"Vendor Name"}).c_str());
  case OL_DEVICE_INFO_DRIVER_VERSION:
    // The key differs by backend, so try both spellings.
    return ReturnValue(
        GetInfo({"CUDA Driver Version", "HSA Runtime Version"}).c_str());
  default:
    return OL_ERRC_INVALID_ENUMERATION;
  }

  // Unreachable: every case above returns.
  return OL_SUCCESS;
}
/// Value query: forwards to the shared worker without a size-out pointer.
ol_impl_result_t olGetDeviceInfo_impl(ol_device_handle_t Device,
                                      ol_device_info_t PropName,
                                      size_t PropSize, void *PropValue) {
  return olGetDeviceInfoImplDetail(Device, PropName, PropSize, PropValue,
                                   /*PropSizeRet=*/nullptr);
}
/// Size-only query: forwards to the shared worker with no output buffer.
ol_impl_result_t olGetDeviceInfoSize_impl(ol_device_handle_t Device,
                                          ol_device_info_t PropName,
                                          size_t *PropSizeRet) {
  return olGetDeviceInfoImplDetail(Device, PropName, /*PropSize=*/0,
                                   /*PropValue=*/nullptr, PropSizeRet);
}
/// Invoke Callback for every device on every platform until it returns false.
/// A false return stops the entire iteration — the previous `break` only left
/// the inner loop, so devices on subsequent platforms were still visited.
ol_impl_result_t olIterateDevices_impl(ol_device_iterate_cb_t Callback,
                                       void *UserData) {
  for (auto &Platform : Platforms()) {
    for (auto &Device : Platform.Devices) {
      if (!Callback(&Device, UserData))
        return OL_SUCCESS;
    }
  }
  return OL_SUCCESS;
}
/// Translate the public Offload allocation kind into the plugin's
/// TargetAllocTy. Managed (and any unrecognized) kinds map to shared memory.
TargetAllocTy convertOlToPluginAllocTy(ol_alloc_type_t Type) {
  switch (Type) {
  case OL_ALLOC_TYPE_HOST:
    return TARGET_ALLOC_HOST;
  case OL_ALLOC_TYPE_DEVICE:
    return TARGET_ALLOC_DEVICE;
  case OL_ALLOC_TYPE_MANAGED:
  default:
    return TARGET_ALLOC_SHARED;
  }
}
/// Allocate Size bytes of the requested kind on Device and register the
/// result so olMemFree can later route the release to the right device.
ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
                                 ol_alloc_type_t Type, size_t Size,
                                 void **AllocationOut) {
  auto Alloc =
      Device->Device->dataAlloc(Size, nullptr, convertOlToPluginAllocTy(Type));
  if (!Alloc)
    return {OL_ERRC_OUT_OF_RESOURCES,
            formatv("Could not create allocation on device {0}", Device).str()};

  // Record ownership before handing the pointer back to the caller.
  allocInfoMap().insert_or_assign(*Alloc, AllocInfo{Device, Type});
  *AllocationOut = *Alloc;
  return OL_SUCCESS;
}
/// Free an allocation previously created by olMemAlloc, routing the release
/// to the device and allocation kind recorded at allocation time.
ol_impl_result_t olMemFree_impl(void *Address) {
  // Single lookup instead of the previous contains() + at() + erase(Address)
  // triple hashing of the same key.
  auto It = allocInfoMap().find(Address);
  if (It == allocInfoMap().end())
    return {OL_ERRC_INVALID_ARGUMENT, "Address is not a known allocation"};

  auto Device = It->second.Device;
  auto Type = It->second.Type;

  auto Res =
      Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type));
  if (Res)
    // Keep the bookkeeping entry alive if the plugin refused to free.
    return {OL_ERRC_OUT_OF_RESOURCES, "Could not free allocation"};

  allocInfoMap().erase(It);
  return OL_SUCCESS;
}
/// Create a queue on Device by allocating the handle wrapper and asking the
/// plugin for a fresh stream resource.
ol_impl_result_t olCreateQueue_impl(ol_device_handle_t Device,
                                    ol_queue_handle_t *Queue) {
  // unique_ptr guards the wrapper until stream creation is known to succeed.
  auto NewQueue = std::make_unique<ol_queue_impl_t>(nullptr, Device);
  auto InitErr = Device->Device->initAsyncInfo(&(NewQueue->AsyncInfo));
  if (InitErr)
    return {OL_ERRC_UNKNOWN, "Could not initialize stream resource"};

  *Queue = NewQueue.release();
  return OL_SUCCESS;
}
/// Destroy a queue handle. Only the wrapper object is deleted; the underlying
/// __tgt_async_info is not explicitly released (ol_queue_impl_t has no
/// destructor) — NOTE(review): confirm the plugin owns that resource.
ol_impl_result_t olDestroyQueue_impl(ol_queue_handle_t Queue) {
  return olDestroy(Queue);
}
/// Block until all work submitted to Queue completes, then recreate the
/// underlying stream so the queue handle remains usable afterwards.
ol_impl_result_t olWaitQueue_impl(ol_queue_handle_t Queue) {
  // Host plugin doesn't have a queue set so it's not safe to call synchronize
  // on it, but we have nothing to synchronize in that situation anyway.
  if (Queue->AsyncInfo->Queue) {
    auto Err = Queue->Device->Device->synchronize(Queue->AsyncInfo);
    if (Err)
      return {OL_ERRC_INVALID_QUEUE, "The queue failed to synchronize"};
  }

  // Recreate the stream resource so the queue can be reused
  // TODO: Would be easier for the synchronization to (optionally) not release
  // it to begin with.
  auto Res = Queue->Device->Device->initAsyncInfo(&Queue->AsyncInfo);
  if (Res)
    return {OL_ERRC_UNKNOWN, "Could not reinitialize the stream resource"};

  return OL_SUCCESS;
}
/// Block the caller until the recorded event has been reached on the device
/// that owns the event's queue.
ol_impl_result_t olWaitEvent_impl(ol_event_handle_t Event) {
  auto &DeviceImpl = *Event->Queue->Device->Device;
  if (auto Res = DeviceImpl.syncEvent(Event->EventInfo))
    return {OL_ERRC_INVALID_EVENT, "The event failed to synchronize"};
  return OL_SUCCESS;
}
/// Destroy an event handle; ~ol_event_impl_t releases the plugin-side event.
ol_impl_result_t olDestroyEvent_impl(ol_event_handle_t Event) {
  return olDestroy(Event);
}
/// Create a plugin event and record it on Queue's current stream. Returns
/// nullptr on any failure; the underlying error is not propagated.
ol_event_handle_t makeEvent(ol_queue_handle_t Queue) {
  auto EventImpl = std::make_unique<ol_event_impl_t>(nullptr, Queue);
  auto Res = Queue->Device->Device->createEvent(&EventImpl->EventInfo);
  if (Res)
    // NOTE(review): EventImpl's destructor runs here and calls destroyEvent
    // on a possibly still-null EventInfo — confirm plugins tolerate that.
    return nullptr;
  Res = Queue->Device->Device->recordEvent(EventImpl->EventInfo,
                                           Queue->AsyncInfo);
  if (Res)
    return nullptr;
  // Success: hand raw ownership to the caller's ol_event_handle_t.
  return EventImpl.release();
}
/// Copy Size bytes between two (possibly device-resident) pointers. With a
/// queue the plugin transfer is enqueued; without one it is synchronous.
/// Optionally returns a completion event via EventOut.
ol_impl_result_t olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
                               ol_device_handle_t DstDevice, void *SrcPtr,
                               ol_device_handle_t SrcDevice, size_t Size,
                               ol_event_handle_t *EventOut) {
  // Host-to-host copies bypass the plugins entirely and are always
  // synchronous, so a queue is rejected for them.
  if (DstDevice == HostDevice() && SrcDevice == HostDevice()) {
    if (!Queue) {
      std::memcpy(DstPtr, SrcPtr, Size);
      return OL_SUCCESS;
    } else {
      return {OL_ERRC_INVALID_ARGUMENT,
              "One of DstDevice and SrcDevice must be a non-host device if "
              "Queue is specified"};
    }
  }

  // If no queue is given the memcpy will be synchronous
  auto QueueImpl = Queue ? Queue->AsyncInfo : nullptr;

  // Dispatch on transfer direction: device-to-host, host-to-device, or
  // device-to-device.
  if (DstDevice == HostDevice()) {
    auto Res = SrcDevice->Device->dataRetrieve(DstPtr, SrcPtr, Size, QueueImpl);
    if (Res)
      return {OL_ERRC_UNKNOWN, "The data retrieve operation failed"};
  } else if (SrcDevice == HostDevice()) {
    auto Res = DstDevice->Device->dataSubmit(DstPtr, SrcPtr, Size, QueueImpl);
    if (Res)
      return {OL_ERRC_UNKNOWN, "The data submit operation failed"};
  } else {
    auto Res = SrcDevice->Device->dataExchange(SrcPtr, *DstDevice->Device,
                                               DstPtr, Size, QueueImpl);
    if (Res)
      return {OL_ERRC_UNKNOWN, "The data exchange operation failed"};
  }

  // NOTE(review): makeEvent dereferences Queue, so requesting EventOut with a
  // null Queue would crash here — confirm validation rejects that earlier.
  if (EventOut)
    *EventOut = makeEvent(Queue);

  return OL_SUCCESS;
}
/// Create a program handle by copying the provided binary and loading it onto
/// Device through the plugin.
ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device,
                                      const void *ProgData, size_t ProgDataSize,
                                      ol_program_handle_t *Program) {
  // Make a copy of the program binary in case it is released by the caller.
  auto ImageData = MemoryBuffer::getMemBufferCopy(
      StringRef(reinterpret_cast<const char *>(ProgData), ProgDataSize));
  // The descriptor's begin/end pointers reference the owned copy above, which
  // lives as long as the program handle does.
  auto DeviceImage = __tgt_device_image{
      const_cast<char *>(ImageData->getBuffer().data()),
      const_cast<char *>(ImageData->getBuffer().data()) + ProgDataSize, nullptr,
      nullptr};

  ol_program_handle_t Prog =
      new ol_program_impl_t(nullptr, std::move(ImageData), DeviceImage);

  auto Res =
      Device->Device->loadBinary(Device->Device->Plugin, &Prog->DeviceImage);
  if (!Res) {
    // NOTE(review): if loadBinary returns an llvm::Expected, its error is
    // dropped unconsumed here — confirm/consume it before returning.
    delete Prog;
    return OL_ERRC_INVALID_VALUE;
  }

  Prog->Image = *Res;
  *Program = Prog;
  return OL_SUCCESS;
}
/// Destroy a program handle; the owned binary copy is freed via the impl's
/// members. The plugin-side image is not explicitly unloaded here
/// (ol_program_impl_t has no destructor).
ol_impl_result_t olDestroyProgram_impl(ol_program_handle_t Program) {
  return olDestroy(Program);
}
/// Look up KernelName in Program's loaded image and return an initialized
/// kernel handle.
ol_impl_result_t olGetKernel_impl(ol_program_handle_t Program,
                                  const char *KernelName,
                                  ol_kernel_handle_t *Kernel) {
  auto &Device = Program->Image->getDevice();
  auto KernelImpl = Device.constructKernel(KernelName);
  if (!KernelImpl)
    return OL_ERRC_INVALID_KERNEL_NAME;
  auto Err = KernelImpl->init(Device, *Program->Image);
  if (Err)
    return {OL_ERRC_UNKNOWN, "Could not initialize the kernel"};
  // NOTE(review): the handle stores a raw pointer into the object returned by
  // constructKernel — presumably the device owns its lifetime; confirm.
  *Kernel = &*KernelImpl;
  return OL_SUCCESS;
}
/// Launch Kernel on Device with the given group/grid sizes and a raw argument
/// buffer, optionally asynchronously on Queue, optionally returning an event.
ol_impl_result_t
olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
                    ol_kernel_handle_t Kernel, const void *ArgumentsData,
                    size_t ArgumentsSize,
                    const ol_kernel_launch_size_args_t *LaunchSizeArgs,
                    ol_event_handle_t *EventOut) {
  auto *DeviceImpl = Device->Device;
  // Reject a queue/device mismatch rather than silently preferring one.
  if (Queue && Device != Queue->Device) {
    return {OL_ERRC_INVALID_DEVICE,
            "Device specified does not match the device of the given queue"};
  }

  // A null queue makes the launch synchronous.
  auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
  AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);

  // Translate the flat Offload size struct into the plugin's 3-D team/thread
  // arrays.
  KernelArgsTy LaunchArgs{};
  LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX;
  LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY;
  LaunchArgs.NumTeams[2] = LaunchSizeArgs->NumGroupsZ;
  LaunchArgs.ThreadLimit[0] = LaunchSizeArgs->GroupSizeX;
  LaunchArgs.ThreadLimit[1] = LaunchSizeArgs->GroupSizeY;
  LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSizeZ;
  LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;

  KernelLaunchParamsTy Params;
  Params.Data = const_cast<void *>(ArgumentsData);
  Params.Size = ArgumentsSize;
  LaunchArgs.ArgPtrs = reinterpret_cast<void **>(&Params);
  // Don't do anything with pointer indirection; use arg data as-is
  LaunchArgs.Flags.IsCUDA = true;

  auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
  auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
                                LaunchArgs, AsyncInfoWrapper);

  // finalize is called unconditionally — presumably it consumes/updates Err
  // and releases the wrapper; confirm against AsyncInfoWrapperTy.
  AsyncInfoWrapper.finalize(Err);
  if (Err)
    // NOTE(review): this path also covers launch failures, not only
    // finalization, so the message may be misleading.
    return {OL_ERRC_UNKNOWN, "Could not finalize the AsyncInfoWrapper"};

  if (EventOut)
    *EventOut = makeEvent(Queue);

  return OL_SUCCESS;
}
} // namespace offload
} // namespace llvm