| //===- ol_impl.cpp - Implementation of the new LLVM/Offload API ------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This contains the definitions of the new LLVM/Offload API entry points. See |
| // new-api/API/README.md for more information. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "OffloadImpl.hpp" |
| #include "Helpers.hpp" |
| #include "PluginManager.h" |
| #include "llvm/Support/FormatVariadic.h" |
| #include <OffloadAPI.h> |
| |
| #include <mutex> |
| |
// TODO: Certain plugins assume they are linked into libomptarget, which
// provides the symbols below as part of its OMPT callback implementation.
// Defining them here in libLLVMOffload as false/null is the least invasive
// workaround: the callbacks are simply never used. Longer term the plugins
// should be able to implement callbacks without depending on libomptarget
// details.
#ifdef OMPT_SUPPORT
namespace llvm::omp::target::ompt {
bool Initialized = false;
ompt_get_callback_t lookupCallbackByCode = nullptr;
ompt_function_lookup_t lookupCallbackByName = nullptr;
} // namespace llvm::omp::target::ompt
#endif
| |
| using namespace llvm::omp::target; |
| using namespace llvm::omp::target::plugin; |
| |
| // Handle type definitions. Ideally these would be 1:1 with the plugins, but |
| // we add some additional data here for now to avoid churn in the plugin |
| // interface. |
| struct ol_device_impl_t { |
| ol_device_impl_t(int DeviceNum, GenericDeviceTy *Device, |
| ol_platform_handle_t Platform) |
| : DeviceNum(DeviceNum), Device(Device), Platform(Platform) {} |
| int DeviceNum; |
| GenericDeviceTy *Device; |
| ol_platform_handle_t Platform; |
| }; |
| |
| struct ol_platform_impl_t { |
| ol_platform_impl_t(std::unique_ptr<GenericPluginTy> Plugin, |
| std::vector<ol_device_impl_t> Devices, |
| ol_platform_backend_t BackendType) |
| : Plugin(std::move(Plugin)), Devices(Devices), BackendType(BackendType) {} |
| std::unique_ptr<GenericPluginTy> Plugin; |
| std::vector<ol_device_impl_t> Devices; |
| ol_platform_backend_t BackendType; |
| }; |
| |
| struct ol_queue_impl_t { |
| ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device) |
| : AsyncInfo(AsyncInfo), Device(Device) {} |
| __tgt_async_info *AsyncInfo; |
| ol_device_handle_t Device; |
| }; |
| |
| struct ol_event_impl_t { |
| ol_event_impl_t(void *EventInfo, ol_queue_handle_t Queue) |
| : EventInfo(EventInfo), Queue(Queue) {} |
| ~ol_event_impl_t() { (void)Queue->Device->Device->destroyEvent(EventInfo); } |
| void *EventInfo; |
| ol_queue_handle_t Queue; |
| }; |
| |
| struct ol_program_impl_t { |
| ol_program_impl_t(plugin::DeviceImageTy *Image, |
| std::unique_ptr<llvm::MemoryBuffer> ImageData, |
| const __tgt_device_image &DeviceImage) |
| : Image(Image), ImageData(std::move(ImageData)), |
| DeviceImage(DeviceImage) {} |
| plugin::DeviceImageTy *Image; |
| std::unique_ptr<llvm::MemoryBuffer> ImageData; |
| __tgt_device_image DeviceImage; |
| }; |
| |
| namespace llvm { |
| namespace offload { |
| |
| struct AllocInfo { |
| ol_device_handle_t Device; |
| ol_alloc_type_t Type; |
| }; |
| |
| using AllocInfoMapT = DenseMap<void *, AllocInfo>; |
| AllocInfoMapT &allocInfoMap() { |
| static AllocInfoMapT AllocInfoMap{}; |
| return AllocInfoMap; |
| } |
| |
| using PlatformVecT = SmallVector<ol_platform_impl_t, 4>; |
| PlatformVecT &Platforms() { |
| static PlatformVecT Platforms; |
| return Platforms; |
| } |
| |
| ol_device_handle_t HostDevice() { |
| // The host platform is always inserted last |
| return &Platforms().back().Devices[0]; |
| } |
| |
| template <typename HandleT> ol_impl_result_t olDestroy(HandleT Handle) { |
| delete Handle; |
| return OL_SUCCESS; |
| } |
| |
| constexpr ol_platform_backend_t pluginNameToBackend(StringRef Name) { |
| if (Name == "amdgpu") { |
| return OL_PLATFORM_BACKEND_AMDGPU; |
| } else if (Name == "cuda") { |
| return OL_PLATFORM_BACKEND_CUDA; |
| } else { |
| return OL_PLATFORM_BACKEND_UNKNOWN; |
| } |
| } |
| |
| // Every plugin exports this method to create an instance of the plugin type. |
| #define PLUGIN_TARGET(Name) extern "C" GenericPluginTy *createPlugin_##Name(); |
| #include "Shared/Targets.def" |
| |
| void initPlugins() { |
| // Attempt to create an instance of each supported plugin. |
| #define PLUGIN_TARGET(Name) \ |
| do { \ |
| Platforms().emplace_back(ol_platform_impl_t{ \ |
| std::unique_ptr<GenericPluginTy>(createPlugin_##Name()), \ |
| {}, \ |
| pluginNameToBackend(#Name)}); \ |
| } while (false); |
| #include "Shared/Targets.def" |
| |
| // Preemptively initialize all devices in the plugin |
| for (auto &Platform : Platforms()) { |
| // Do not use the host plugin - it isn't supported. |
| if (Platform.BackendType == OL_PLATFORM_BACKEND_UNKNOWN) |
| continue; |
| auto Err = Platform.Plugin->init(); |
| [[maybe_unused]] std::string InfoMsg = toString(std::move(Err)); |
| for (auto DevNum = 0; DevNum < Platform.Plugin->number_of_devices(); |
| DevNum++) { |
| if (Platform.Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) { |
| Platform.Devices.emplace_back(ol_device_impl_t{ |
| DevNum, &Platform.Plugin->getDevice(DevNum), &Platform}); |
| } |
| } |
| } |
| |
| // Add the special host device |
| auto &HostPlatform = Platforms().emplace_back( |
| ol_platform_impl_t{nullptr, |
| {ol_device_impl_t{-1, nullptr, nullptr}}, |
| OL_PLATFORM_BACKEND_HOST}); |
| HostDevice()->Platform = &HostPlatform; |
| |
| offloadConfig().TracingEnabled = std::getenv("OFFLOAD_TRACE"); |
| offloadConfig().ValidationEnabled = |
| !std::getenv("OFFLOAD_DISABLE_VALIDATION"); |
| } |
| |
| // TODO: We can properly reference count here and manage the resources in a more |
| // clever way |
| ol_impl_result_t olInit_impl() { |
| static std::once_flag InitFlag; |
| std::call_once(InitFlag, initPlugins); |
| |
| return OL_SUCCESS; |
| } |
| ol_impl_result_t olShutDown_impl() { return OL_SUCCESS; } |
| |
| ol_impl_result_t olGetPlatformInfoImplDetail(ol_platform_handle_t Platform, |
| ol_platform_info_t PropName, |
| size_t PropSize, void *PropValue, |
| size_t *PropSizeRet) { |
| ReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet); |
| bool IsHost = Platform->BackendType == OL_PLATFORM_BACKEND_HOST; |
| |
| switch (PropName) { |
| case OL_PLATFORM_INFO_NAME: |
| return ReturnValue(IsHost ? "Host" : Platform->Plugin->getName()); |
| case OL_PLATFORM_INFO_VENDOR_NAME: |
| // TODO: Implement this |
| return ReturnValue("Unknown platform vendor"); |
| case OL_PLATFORM_INFO_VERSION: { |
| return ReturnValue(formatv("v{0}.{1}.{2}", OL_VERSION_MAJOR, |
| OL_VERSION_MINOR, OL_VERSION_PATCH) |
| .str() |
| .c_str()); |
| } |
| case OL_PLATFORM_INFO_BACKEND: { |
| return ReturnValue(Platform->BackendType); |
| } |
| default: |
| return OL_ERRC_INVALID_ENUMERATION; |
| } |
| |
| return OL_SUCCESS; |
| } |
| |
| ol_impl_result_t olGetPlatformInfo_impl(ol_platform_handle_t Platform, |
| ol_platform_info_t PropName, |
| size_t PropSize, void *PropValue) { |
| return olGetPlatformInfoImplDetail(Platform, PropName, PropSize, PropValue, |
| nullptr); |
| } |
| |
| ol_impl_result_t olGetPlatformInfoSize_impl(ol_platform_handle_t Platform, |
| ol_platform_info_t PropName, |
| size_t *PropSizeRet) { |
| return olGetPlatformInfoImplDetail(Platform, PropName, 0, nullptr, |
| PropSizeRet); |
| } |
| |
| ol_impl_result_t olGetDeviceInfoImplDetail(ol_device_handle_t Device, |
| ol_device_info_t PropName, |
| size_t PropSize, void *PropValue, |
| size_t *PropSizeRet) { |
| |
| ReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet); |
| |
| // Find the info if it exists under any of the given names |
| auto GetInfo = [&](std::vector<std::string> Names) { |
| InfoQueueTy DevInfo; |
| if (auto Err = Device->Device->obtainInfoImpl(DevInfo)) |
| return std::string(""); |
| |
| for (auto Name : Names) { |
| auto InfoKeyMatches = [&](const InfoQueueTy::InfoQueueEntryTy &Info) { |
| return Info.Key == Name; |
| }; |
| auto Item = std::find_if(DevInfo.getQueue().begin(), |
| DevInfo.getQueue().end(), InfoKeyMatches); |
| |
| if (Item != std::end(DevInfo.getQueue())) { |
| return Item->Value; |
| } |
| } |
| |
| return std::string(""); |
| }; |
| |
| switch (PropName) { |
| case OL_DEVICE_INFO_PLATFORM: |
| return ReturnValue(Device->Platform); |
| case OL_DEVICE_INFO_TYPE: |
| return ReturnValue(OL_DEVICE_TYPE_GPU); |
| case OL_DEVICE_INFO_NAME: |
| return ReturnValue(GetInfo({"Device Name"}).c_str()); |
| case OL_DEVICE_INFO_VENDOR: |
| return ReturnValue(GetInfo({"Vendor Name"}).c_str()); |
| case OL_DEVICE_INFO_DRIVER_VERSION: |
| return ReturnValue( |
| GetInfo({"CUDA Driver Version", "HSA Runtime Version"}).c_str()); |
| default: |
| return OL_ERRC_INVALID_ENUMERATION; |
| } |
| |
| return OL_SUCCESS; |
| } |
| |
| ol_impl_result_t olGetDeviceInfo_impl(ol_device_handle_t Device, |
| ol_device_info_t PropName, |
| size_t PropSize, void *PropValue) { |
| return olGetDeviceInfoImplDetail(Device, PropName, PropSize, PropValue, |
| nullptr); |
| } |
| |
| ol_impl_result_t olGetDeviceInfoSize_impl(ol_device_handle_t Device, |
| ol_device_info_t PropName, |
| size_t *PropSizeRet) { |
| return olGetDeviceInfoImplDetail(Device, PropName, 0, nullptr, PropSizeRet); |
| } |
| |
| ol_impl_result_t olIterateDevices_impl(ol_device_iterate_cb_t Callback, |
| void *UserData) { |
| for (auto &Platform : Platforms()) { |
| for (auto &Device : Platform.Devices) { |
| if (!Callback(&Device, UserData)) { |
| break; |
| } |
| } |
| } |
| |
| return OL_SUCCESS; |
| } |
| |
| TargetAllocTy convertOlToPluginAllocTy(ol_alloc_type_t Type) { |
| switch (Type) { |
| case OL_ALLOC_TYPE_DEVICE: |
| return TARGET_ALLOC_DEVICE; |
| case OL_ALLOC_TYPE_HOST: |
| return TARGET_ALLOC_HOST; |
| case OL_ALLOC_TYPE_MANAGED: |
| default: |
| return TARGET_ALLOC_SHARED; |
| } |
| } |
| |
| ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device, |
| ol_alloc_type_t Type, size_t Size, |
| void **AllocationOut) { |
| auto Alloc = |
| Device->Device->dataAlloc(Size, nullptr, convertOlToPluginAllocTy(Type)); |
| if (!Alloc) |
| return {OL_ERRC_OUT_OF_RESOURCES, |
| formatv("Could not create allocation on device {0}", Device).str()}; |
| |
| *AllocationOut = *Alloc; |
| allocInfoMap().insert_or_assign(*Alloc, AllocInfo{Device, Type}); |
| return OL_SUCCESS; |
| } |
| |
| ol_impl_result_t olMemFree_impl(void *Address) { |
| if (!allocInfoMap().contains(Address)) |
| return {OL_ERRC_INVALID_ARGUMENT, "Address is not a known allocation"}; |
| |
| auto AllocInfo = allocInfoMap().at(Address); |
| auto Device = AllocInfo.Device; |
| auto Type = AllocInfo.Type; |
| |
| auto Res = |
| Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type)); |
| if (Res) |
| return {OL_ERRC_OUT_OF_RESOURCES, "Could not free allocation"}; |
| |
| allocInfoMap().erase(Address); |
| |
| return OL_SUCCESS; |
| } |
| |
| ol_impl_result_t olCreateQueue_impl(ol_device_handle_t Device, |
| ol_queue_handle_t *Queue) { |
| auto CreatedQueue = std::make_unique<ol_queue_impl_t>(nullptr, Device); |
| auto Err = Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo)); |
| if (Err) |
| return {OL_ERRC_UNKNOWN, "Could not initialize stream resource"}; |
| |
| *Queue = CreatedQueue.release(); |
| return OL_SUCCESS; |
| } |
| |
| ol_impl_result_t olDestroyQueue_impl(ol_queue_handle_t Queue) { |
| return olDestroy(Queue); |
| } |
| |
| ol_impl_result_t olWaitQueue_impl(ol_queue_handle_t Queue) { |
| // Host plugin doesn't have a queue set so it's not safe to call synchronize |
| // on it, but we have nothing to synchronize in that situation anyway. |
| if (Queue->AsyncInfo->Queue) { |
| auto Err = Queue->Device->Device->synchronize(Queue->AsyncInfo); |
| if (Err) |
| return {OL_ERRC_INVALID_QUEUE, "The queue failed to synchronize"}; |
| } |
| |
| // Recreate the stream resource so the queue can be reused |
| // TODO: Would be easier for the synchronization to (optionally) not release |
| // it to begin with. |
| auto Res = Queue->Device->Device->initAsyncInfo(&Queue->AsyncInfo); |
| if (Res) |
| return {OL_ERRC_UNKNOWN, "Could not reinitialize the stream resource"}; |
| |
| return OL_SUCCESS; |
| } |
| |
| ol_impl_result_t olWaitEvent_impl(ol_event_handle_t Event) { |
| auto Res = Event->Queue->Device->Device->syncEvent(Event->EventInfo); |
| if (Res) |
| return {OL_ERRC_INVALID_EVENT, "The event failed to synchronize"}; |
| |
| return OL_SUCCESS; |
| } |
| |
| ol_impl_result_t olDestroyEvent_impl(ol_event_handle_t Event) { |
| return olDestroy(Event); |
| } |
| |
| ol_event_handle_t makeEvent(ol_queue_handle_t Queue) { |
| auto EventImpl = std::make_unique<ol_event_impl_t>(nullptr, Queue); |
| auto Res = Queue->Device->Device->createEvent(&EventImpl->EventInfo); |
| if (Res) |
| return nullptr; |
| |
| Res = Queue->Device->Device->recordEvent(EventImpl->EventInfo, |
| Queue->AsyncInfo); |
| if (Res) |
| return nullptr; |
| |
| return EventImpl.release(); |
| } |
| |
| ol_impl_result_t olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr, |
| ol_device_handle_t DstDevice, void *SrcPtr, |
| ol_device_handle_t SrcDevice, size_t Size, |
| ol_event_handle_t *EventOut) { |
| if (DstDevice == HostDevice() && SrcDevice == HostDevice()) { |
| if (!Queue) { |
| std::memcpy(DstPtr, SrcPtr, Size); |
| return OL_SUCCESS; |
| } else { |
| return {OL_ERRC_INVALID_ARGUMENT, |
| "One of DstDevice and SrcDevice must be a non-host device if " |
| "Queue is specified"}; |
| } |
| } |
| |
| // If no queue is given the memcpy will be synchronous |
| auto QueueImpl = Queue ? Queue->AsyncInfo : nullptr; |
| |
| if (DstDevice == HostDevice()) { |
| auto Res = SrcDevice->Device->dataRetrieve(DstPtr, SrcPtr, Size, QueueImpl); |
| if (Res) |
| return {OL_ERRC_UNKNOWN, "The data retrieve operation failed"}; |
| } else if (SrcDevice == HostDevice()) { |
| auto Res = DstDevice->Device->dataSubmit(DstPtr, SrcPtr, Size, QueueImpl); |
| if (Res) |
| return {OL_ERRC_UNKNOWN, "The data submit operation failed"}; |
| } else { |
| auto Res = SrcDevice->Device->dataExchange(SrcPtr, *DstDevice->Device, |
| DstPtr, Size, QueueImpl); |
| if (Res) |
| return {OL_ERRC_UNKNOWN, "The data exchange operation failed"}; |
| } |
| |
| if (EventOut) |
| *EventOut = makeEvent(Queue); |
| |
| return OL_SUCCESS; |
| } |
| |
| ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, |
| const void *ProgData, size_t ProgDataSize, |
| ol_program_handle_t *Program) { |
| // Make a copy of the program binary in case it is released by the caller. |
| auto ImageData = MemoryBuffer::getMemBufferCopy( |
| StringRef(reinterpret_cast<const char *>(ProgData), ProgDataSize)); |
| |
| auto DeviceImage = __tgt_device_image{ |
| const_cast<char *>(ImageData->getBuffer().data()), |
| const_cast<char *>(ImageData->getBuffer().data()) + ProgDataSize, nullptr, |
| nullptr}; |
| |
| ol_program_handle_t Prog = |
| new ol_program_impl_t(nullptr, std::move(ImageData), DeviceImage); |
| |
| auto Res = |
| Device->Device->loadBinary(Device->Device->Plugin, &Prog->DeviceImage); |
| if (!Res) { |
| delete Prog; |
| return OL_ERRC_INVALID_VALUE; |
| } |
| |
| Prog->Image = *Res; |
| *Program = Prog; |
| |
| return OL_SUCCESS; |
| } |
| |
| ol_impl_result_t olDestroyProgram_impl(ol_program_handle_t Program) { |
| return olDestroy(Program); |
| } |
| |
| ol_impl_result_t olGetKernel_impl(ol_program_handle_t Program, |
| const char *KernelName, |
| ol_kernel_handle_t *Kernel) { |
| |
| auto &Device = Program->Image->getDevice(); |
| auto KernelImpl = Device.constructKernel(KernelName); |
| if (!KernelImpl) |
| return OL_ERRC_INVALID_KERNEL_NAME; |
| |
| auto Err = KernelImpl->init(Device, *Program->Image); |
| if (Err) |
| return {OL_ERRC_UNKNOWN, "Could not initialize the kernel"}; |
| |
| *Kernel = &*KernelImpl; |
| |
| return OL_SUCCESS; |
| } |
| |
| ol_impl_result_t |
| olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, |
| ol_kernel_handle_t Kernel, const void *ArgumentsData, |
| size_t ArgumentsSize, |
| const ol_kernel_launch_size_args_t *LaunchSizeArgs, |
| ol_event_handle_t *EventOut) { |
| auto *DeviceImpl = Device->Device; |
| if (Queue && Device != Queue->Device) { |
| return {OL_ERRC_INVALID_DEVICE, |
| "Device specified does not match the device of the given queue"}; |
| } |
| |
| auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr; |
| AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl); |
| KernelArgsTy LaunchArgs{}; |
| LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX; |
| LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY; |
| LaunchArgs.NumTeams[2] = LaunchSizeArgs->NumGroupsZ; |
| LaunchArgs.ThreadLimit[0] = LaunchSizeArgs->GroupSizeX; |
| LaunchArgs.ThreadLimit[1] = LaunchSizeArgs->GroupSizeY; |
| LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSizeZ; |
| LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory; |
| |
| KernelLaunchParamsTy Params; |
| Params.Data = const_cast<void *>(ArgumentsData); |
| Params.Size = ArgumentsSize; |
| LaunchArgs.ArgPtrs = reinterpret_cast<void **>(&Params); |
| // Don't do anything with pointer indirection; use arg data as-is |
| LaunchArgs.Flags.IsCUDA = true; |
| |
| auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel); |
| auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr, |
| LaunchArgs, AsyncInfoWrapper); |
| |
| AsyncInfoWrapper.finalize(Err); |
| if (Err) |
| return {OL_ERRC_UNKNOWN, "Could not finalize the AsyncInfoWrapper"}; |
| |
| if (EventOut) |
| *EventOut = makeEvent(Queue); |
| |
| return OL_SUCCESS; |
| } |
| |
| } // namespace offload |
| } // namespace llvm |