| //===----RTLs/amdgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // RTL NextGen for AMDGPU machine |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include <atomic> |
| #include <cassert> |
| #include <cstddef> |
| #include <deque> |
| #include <mutex> |
| #include <string> |
| #include <system_error> |
| #include <unistd.h> |
| #include <unordered_map> |
| |
| #include "Shared/Debug.h" |
| #include "Shared/Environment.h" |
| #include "Shared/Utils.h" |
| #include "Utils/ELF.h" |
| |
| #include "GlobalHandler.h" |
| #include "OpenMP/OMPT/Callback.h" |
| #include "PluginInterface.h" |
| #include "UtilitiesRTL.h" |
| #include "omptarget.h" |
| |
| #include "llvm/ADT/SmallString.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/BinaryFormat/ELF.h" |
| #include "llvm/Frontend/OpenMP/OMPConstants.h" |
| #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
| #include "llvm/Support/Error.h" |
| #include "llvm/Support/FileSystem.h" |
| #include "llvm/Support/MemoryBuffer.h" |
| #include "llvm/Support/Program.h" |
| #include "llvm/Support/raw_ostream.h" |
| |
| #if defined(__has_include) |
| #if __has_include("hsa/hsa.h") |
| #include "hsa/hsa.h" |
| #include "hsa/hsa_ext_amd.h" |
| #elif __has_include("hsa.h") |
| #include "hsa.h" |
| #include "hsa_ext_amd.h" |
| #endif |
| #else |
| #include "hsa/hsa.h" |
| #include "hsa/hsa_ext_amd.h" |
| #endif |
| |
| namespace llvm { |
| namespace omp { |
| namespace target { |
| namespace plugin { |
| |
| /// Forward declarations for all specialized data structures. |
| struct AMDGPUKernelTy; |
| struct AMDGPUDeviceTy; |
| struct AMDGPUPluginTy; |
| struct AMDGPUStreamTy; |
| struct AMDGPUEventTy; |
| struct AMDGPUStreamManagerTy; |
| struct AMDGPUEventManagerTy; |
| struct AMDGPUDeviceImageTy; |
| struct AMDGPUMemoryManagerTy; |
| struct AMDGPUMemoryPoolTy; |
| |
| namespace utils { |
| |
| /// Iterate elements using an HSA iterate function. Do not use this function |
| /// directly; use the specialized wrappers below instead. |
| template <typename ElemTy, typename IterFuncTy, typename CallbackTy> |
| hsa_status_t iterate(IterFuncTy Func, CallbackTy Cb) { |
| auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { |
| CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
| return (*Unwrapped)(Elem); |
| }; |
| return Func(L, static_cast<void *>(&Cb)); |
| } |
| |
| /// Iterate elements using an HSA iterate function passing a parameter. Do |
| /// not use this function directly; use the specialized wrappers below |
| /// instead. |
| template <typename ElemTy, typename IterFuncTy, typename IterFuncArgTy, |
| typename CallbackTy> |
| hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { |
| auto L = [](ElemTy Elem, void *Data) -> hsa_status_t { |
| CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
| return (*Unwrapped)(Elem); |
| }; |
| return Func(FuncArg, L, static_cast<void *>(&Cb)); |
| } |
| |
| /// Iterate pairs of elements using an HSA iterate function passing a |
| /// parameter. Do not use this function directly; use the specialized |
| /// wrappers below instead. |
| template <typename Elem1Ty, typename Elem2Ty, typename IterFuncTy, |
| typename IterFuncArgTy, typename CallbackTy> |
| hsa_status_t iterate(IterFuncTy Func, IterFuncArgTy FuncArg, CallbackTy Cb) { |
| auto L = [](Elem1Ty Elem1, Elem2Ty Elem2, void *Data) -> hsa_status_t { |
| CallbackTy *Unwrapped = static_cast<CallbackTy *>(Data); |
| return (*Unwrapped)(Elem1, Elem2); |
| }; |
| return Func(FuncArg, L, static_cast<void *>(&Cb)); |
| } |
| |
| /// Iterate agents. |
| template <typename CallbackTy> Error iterateAgents(CallbackTy Callback) { |
| hsa_status_t Status = iterate<hsa_agent_t>(hsa_iterate_agents, Callback); |
| return Plugin::check(Status, "Error in hsa_iterate_agents: %s"); |
| } |
| |
| /// Iterate ISAs of an agent. |
| template <typename CallbackTy> |
| Error iterateAgentISAs(hsa_agent_t Agent, CallbackTy Cb) { |
| hsa_status_t Status = iterate<hsa_isa_t>(hsa_agent_iterate_isas, Agent, Cb); |
| return Plugin::check(Status, "Error in hsa_agent_iterate_isas: %s"); |
| } |
| |
| /// Iterate memory pools of an agent. |
| template <typename CallbackTy> |
| Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) { |
| hsa_status_t Status = iterate<hsa_amd_memory_pool_t>( |
| hsa_amd_agent_iterate_memory_pools, Agent, Cb); |
| return Plugin::check(Status, |
| "Error in hsa_amd_agent_iterate_memory_pools: %s"); |
| } |
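| |
| /// Example (an illustrative sketch, not used by the plugin): collect the GPU |
| /// agents visible to HSA. Assumes hsa_init() has already succeeded. |
| /// \code |
| ///   llvm::SmallVector<hsa_agent_t> GPUAgents; |
| ///   auto Err = iterateAgents([&](hsa_agent_t Agent) { |
| ///     hsa_device_type_t Type; |
| ///     if (hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &Type) != |
| ///         HSA_STATUS_SUCCESS) |
| ///       return HSA_STATUS_ERROR; |
| ///     if (Type == HSA_DEVICE_TYPE_GPU) |
| ///       GPUAgents.push_back(Agent); |
| ///     return HSA_STATUS_SUCCESS; |
| ///   }); |
| /// \endcode |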
| |
| /// Dispatch an asynchronous memory copy. Optionally distribute the dispatch |
| /// across the available SDMA engines in a round-robin fashion. |
| Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent, |
| const void *Src, hsa_agent_t SrcAgent, size_t Size, |
| uint32_t NumDepSignals, const hsa_signal_t *DepSignals, |
| hsa_signal_t CompletionSignal) { |
| if (!UseMultipleSdmaEngines) { |
| hsa_status_t S = |
| hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size, |
| NumDepSignals, DepSignals, CompletionSignal); |
| return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s"); |
| } |
| |
| // Checking the HSA interface version is not ideal, but it is the simplest |
| // way to detect ROCm >= 5.7, which this feature requires. |
| #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \ |
| HSA_AMD_INTERFACE_VERSION_MINOR >= 2) |
| return Plugin::error("Async copy on selected SDMA requires ROCm 5.7"); |
| #else |
| static std::atomic<int> SdmaEngine{1}; |
| |
| // This atomics solution is probably not the best, but should be sufficient |
| // for now. In the worst-case scenario, threads that read the same value |
| // will dispatch to the same SDMA engine. This may result in sub-optimal |
| // performance, but the likelihood of that happening is fairly low. |
| int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire); |
| // This call is only available in ROCm >= 5.7. |
| hsa_status_t S = hsa_amd_memory_async_copy_on_engine( |
| Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals, |
| CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine, |
| /*force_copy_on_sdma=*/true); |
| // Cycle between the two SDMA engine ids 0x1 and 0x2: (1 << 1) % 3 == 2 |
| // and (2 << 1) % 3 == 1. |
| LocalSdmaEngine = (LocalSdmaEngine << 1) % 3; |
| SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed); |
| |
| return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s"); |
| #endif |
| } |
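| |
| /// Example (an illustrative sketch, not used by the plugin): perform a |
| /// blocking copy of Size bytes between two agents by waiting on the |
| /// completion signal. Dst, DstAgent, Src, SrcAgent, and Size are |
| /// placeholders owned by the caller. |
| /// \code |
| ///   hsa_signal_t Signal; |
| ///   hsa_status_t Status = hsa_amd_signal_create(1, 0, nullptr, 0, &Signal); |
| ///   if (auto Err = Plugin::check(Status, "Error in hsa_amd_signal_create: %s")) |
| ///     return Err; |
| ///   if (auto Err = asyncMemCopy(/*UseMultipleSdmaEngines=*/false, Dst, |
| ///                               DstAgent, Src, SrcAgent, Size, |
| ///                               /*NumDepSignals=*/0, /*DepSignals=*/nullptr, |
| ///                               Signal)) |
| ///     return Err; |
| ///   hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, |
| ///                             UINT64_MAX, HSA_WAIT_STATE_BLOCKED); |
| ///   hsa_signal_destroy(Signal); |
| /// \endcode |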
| |
| } // namespace utils |
| |
| /// Utility class representing generic resource references to AMDGPU resources. |
| template <typename ResourceTy> |
| struct AMDGPUResourceRef : public GenericDeviceResourceRef { |
| /// The underlying handle type for resources. |
| using HandleTy = ResourceTy *; |
| |
| /// Create an empty reference to an invalid resource. |
| AMDGPUResourceRef() : Resource(nullptr) {} |
| |
| /// Create a reference to an existing resource. |
| AMDGPUResourceRef(HandleTy Resource) : Resource(Resource) {} |
| |
| virtual ~AMDGPUResourceRef() {} |
| |
| /// Create a new resource and save the reference. The reference must be empty |
| /// before calling this function. |
| Error create(GenericDeviceTy &Device) override; |
| |
| /// Destroy the referenced resource and invalidate the reference. The |
| /// reference must be to a valid resource before calling this function. |
| Error destroy(GenericDeviceTy &Device) override { |
| if (!Resource) |
| return Plugin::error("Destroying an invalid resource"); |
| |
| if (auto Err = Resource->deinit()) |
| return Err; |
| |
| delete Resource; |
| |
| Resource = nullptr; |
| return Plugin::success(); |
| } |
| |
| /// Get the underlying resource handle. |
| operator HandleTy() const { return Resource; } |
| |
| private: |
| /// The handle to the actual resource. |
| HandleTy Resource; |
| }; |
| |
| /// Class holding an HSA memory pool. |
| struct AMDGPUMemoryPoolTy { |
| /// Create a memory pool from an HSA memory pool. |
| AMDGPUMemoryPoolTy(hsa_amd_memory_pool_t MemoryPool) |
| : MemoryPool(MemoryPool), GlobalFlags(0) {} |
| |
| /// Initialize the memory pool retrieving its properties. |
| Error init() { |
| if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_SEGMENT, Segment)) |
| return Err; |
| |
| if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags)) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| /// Getter of the HSA memory pool. |
| hsa_amd_memory_pool_t get() const { return MemoryPool; } |
| |
| /// Indicate the segment to which the memory pool belongs. |
| bool isGlobal() const { return (Segment == HSA_AMD_SEGMENT_GLOBAL); } |
| bool isReadOnly() const { return (Segment == HSA_AMD_SEGMENT_READONLY); } |
| bool isPrivate() const { return (Segment == HSA_AMD_SEGMENT_PRIVATE); } |
| bool isGroup() const { return (Segment == HSA_AMD_SEGMENT_GROUP); } |
| |
| /// Indicate if it is fine-grained memory. Valid only for global. |
| bool isFineGrained() const { |
| assert(isGlobal() && "Not global memory"); |
| return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED); |
| } |
| |
| /// Indicate if it is coarse-grained memory. Valid only for global. |
| bool isCoarseGrained() const { |
| assert(isGlobal() && "Not global memory"); |
| return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED); |
| } |
| |
| /// Indicate if it supports storing kernel arguments. Valid only for global. |
| bool supportsKernelArgs() const { |
| assert(isGlobal() && "Not global memory"); |
| return (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT); |
| } |
| |
| /// Allocate memory on the memory pool. |
| Error allocate(size_t Size, void **PtrStorage) { |
| hsa_status_t Status = |
| hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, PtrStorage); |
| return Plugin::check(Status, "Error in hsa_amd_memory_pool_allocate: %s"); |
| } |
| |
| /// Return memory to the memory pool. |
| Error deallocate(void *Ptr) { |
| hsa_status_t Status = hsa_amd_memory_pool_free(Ptr); |
| return Plugin::check(Status, "Error in hsa_amd_memory_pool_free: %s"); |
| } |
| |
| /// Allow a set of agents to access a specific allocation. |
| Error enableAccess(void *Ptr, int64_t Size, |
| const llvm::SmallVector<hsa_agent_t> &Agents) const { |
| #ifdef OMPTARGET_DEBUG |
| for (hsa_agent_t Agent : Agents) { |
| hsa_amd_memory_pool_access_t Access; |
| if (auto Err = |
| getAttr(Agent, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, Access)) |
| return Err; |
| |
| // If the agent is not allowed to access the memory pool, do not continue; |
| // doing so would result in undefined behavior. |
| if (Access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) |
| return Plugin::error("An agent is not allowed to access a memory pool"); |
| } |
| #endif |
| |
| // Access is possible but disabled by default. Enable it explicitly. |
| hsa_status_t Status = |
| hsa_amd_agents_allow_access(Agents.size(), Agents.data(), nullptr, Ptr); |
| return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s"); |
| } |
| |
| /// Get attribute from the memory pool. |
| template <typename Ty> |
| Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { |
| hsa_status_t Status; |
| Status = hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); |
| return Plugin::check(Status, "Error in hsa_amd_memory_pool_get_info: %s"); |
| } |
| |
| template <typename Ty> |
| hsa_status_t getAttrRaw(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { |
| return hsa_amd_memory_pool_get_info(MemoryPool, Kind, &Value); |
| } |
| |
| /// Get attribute from the memory pool relating to an agent. |
| template <typename Ty> |
| Error getAttr(hsa_agent_t Agent, hsa_amd_agent_memory_pool_info_t Kind, |
| Ty &Value) const { |
| hsa_status_t Status; |
| Status = |
| hsa_amd_agent_memory_pool_get_info(Agent, MemoryPool, Kind, &Value); |
| return Plugin::check(Status, |
| "Error in hsa_amd_agent_memory_pool_get_info: %s"); |
| } |
| |
| private: |
| /// The HSA memory pool. |
| hsa_amd_memory_pool_t MemoryPool; |
| |
| /// The segment to which the memory pool belongs. |
| hsa_amd_segment_t Segment; |
| |
| /// The global flags of the memory pool. Only valid if the memory pool |
| /// belongs to the global segment. |
| uint32_t GlobalFlags; |
| }; |
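| |
| /// Example (an illustrative sketch, not plugin code): allocate a buffer from |
| /// a pool and make it visible to a set of agents. Pool, Agents, and Size are |
| /// placeholders owned by the caller. |
| /// \code |
| ///   void *Ptr = nullptr; |
| ///   if (auto Err = Pool.allocate(Size, &Ptr)) |
| ///     return Err; |
| ///   if (auto Err = Pool.enableAccess(Ptr, Size, Agents)) |
| ///     return Err; |
| ///   // ... use the buffer ... |
| ///   if (auto Err = Pool.deallocate(Ptr)) |
| ///     return Err; |
| /// \endcode |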
| |
| /// Class that implements a memory manager that gets memory from a specific |
| /// memory pool. |
| struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { |
| |
| /// Create an empty memory manager. |
| AMDGPUMemoryManagerTy() : MemoryPool(nullptr), MemoryManager(nullptr) {} |
| |
| /// Initialize the memory manager from a memory pool. |
| Error init(AMDGPUMemoryPoolTy &MemoryPool) { |
| const uint32_t Threshold = 1 << 30; |
| this->MemoryManager = new MemoryManagerTy(*this, Threshold); |
| this->MemoryPool = &MemoryPool; |
| return Plugin::success(); |
| } |
| |
| /// Deinitialize the memory manager and free its allocations. |
| Error deinit() { |
| assert(MemoryManager && "Invalid memory manager"); |
| |
| // Delete and invalidate the memory manager. At this point, the memory |
| // manager will deallocate all its allocations. |
| delete MemoryManager; |
| MemoryManager = nullptr; |
| |
| return Plugin::success(); |
| } |
| |
| /// Reuse or allocate memory through the memory manager. |
| Error allocate(size_t Size, void **PtrStorage) { |
| assert(MemoryManager && "Invalid memory manager"); |
| assert(PtrStorage && "Invalid pointer storage"); |
| |
| *PtrStorage = MemoryManager->allocate(Size, nullptr); |
| if (*PtrStorage == nullptr) |
| return Plugin::error("Failure to allocate from AMDGPU memory manager"); |
| |
| return Plugin::success(); |
| } |
| |
| /// Release an allocation to be reused. |
| Error deallocate(void *Ptr) { |
| assert(Ptr && "Invalid pointer"); |
| |
| if (MemoryManager->free(Ptr)) |
| return Plugin::error("Failure to deallocate from AMDGPU memory manager"); |
| |
| return Plugin::success(); |
| } |
| |
| private: |
| /// Allocation callback that will be called once the memory manager has no |
| /// more previously allocated buffers to reuse. |
| void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override; |
| |
| /// Deallocation callback that will be called by the memory manager. |
| int free(void *TgtPtr, TargetAllocTy Kind) override { |
| if (auto Err = MemoryPool->deallocate(TgtPtr)) { |
| consumeError(std::move(Err)); |
| return OFFLOAD_FAIL; |
| } |
| return OFFLOAD_SUCCESS; |
| } |
| |
| /// The memory pool used to allocate memory. |
| AMDGPUMemoryPoolTy *MemoryPool; |
| |
| /// Reference to the actual memory manager. |
| MemoryManagerTy *MemoryManager; |
| }; |
| |
| /// Class implementing the AMDGPU device images' properties. |
| struct AMDGPUDeviceImageTy : public DeviceImageTy { |
| /// Create the AMDGPU image with the id and the target image pointer. |
| AMDGPUDeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) |
| : DeviceImageTy(ImageId, TgtImage) {} |
| |
| /// Prepare and load the executable corresponding to the image. |
| Error loadExecutable(const AMDGPUDeviceTy &Device); |
| |
| /// Unload the executable. |
| Error unloadExecutable() { |
| hsa_status_t Status = hsa_executable_destroy(Executable); |
| if (auto Err = Plugin::check(Status, "Error in hsa_executable_destroy: %s")) |
| return Err; |
| |
| Status = hsa_code_object_destroy(CodeObject); |
| return Plugin::check(Status, "Error in hsa_code_object_destroy: %s"); |
| } |
| |
| /// Get the executable. |
| hsa_executable_t getExecutable() const { return Executable; } |
| |
| /// Get the code object version of the ELF. |
| uint16_t getELFABIVersion() const { return ELFABIVersion; } |
| |
| /// Find an HSA device symbol by its name on the executable. |
| Expected<hsa_executable_symbol_t> |
| findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const; |
| |
| /// Get additional info for the kernel, e.g., register spill counts. |
| std::optional<utils::KernelMetaDataTy> |
| getKernelInfo(StringRef Identifier) const { |
| auto It = KernelInfoMap.find(Identifier); |
| |
| if (It == KernelInfoMap.end()) |
| return {}; |
| |
| return It->second; |
| } |
| |
| private: |
| /// The executable loaded on the agent. |
| hsa_executable_t Executable; |
| hsa_code_object_t CodeObject; |
| StringMap<utils::KernelMetaDataTy> KernelInfoMap; |
| uint16_t ELFABIVersion; |
| }; |
| |
| /// Class implementing the AMDGPU kernel functionalities which derives from the |
| /// generic kernel class. |
| struct AMDGPUKernelTy : public GenericKernelTy { |
| /// Create an AMDGPU kernel with a name and an execution mode. |
| AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {} |
| |
| /// Initialize the AMDGPU kernel. |
| Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override { |
| AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); |
| |
| // Kernel symbols have a ".kd" suffix. |
| std::string KernelName(getName()); |
| KernelName += ".kd"; |
| |
| // Find the symbol on the device executable. |
| auto SymbolOrErr = AMDImage.findDeviceSymbol(Device, KernelName); |
| if (!SymbolOrErr) |
| return SymbolOrErr.takeError(); |
| |
| hsa_executable_symbol_t Symbol = *SymbolOrErr; |
| hsa_symbol_kind_t SymbolType; |
| hsa_status_t Status; |
| |
| // Retrieve different properties of the kernel symbol. |
| std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { |
| {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, |
| {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject}, |
| {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize}, |
| {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize}, |
| {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack}, |
| {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}}; |
| |
| for (auto &Info : RequiredInfos) { |
| Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); |
| if (auto Err = Plugin::check( |
| Status, "Error in hsa_executable_symbol_get_info: %s")) |
| return Err; |
| } |
| |
| // Make sure it is a kernel symbol. |
| if (SymbolType != HSA_SYMBOL_KIND_KERNEL) |
| return Plugin::error("Symbol %s is not a kernel function"); |
| |
| // TODO: Read the kernel descriptor for the max threads per block. May be |
| // read from the image. |
| |
| ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion()); |
| DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion()); |
| |
| // Get additional kernel info read from image |
| KernelInfo = AMDImage.getKernelInfo(getName()); |
| if (!KernelInfo.has_value()) |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(), |
| "Could not read extra information for kernel %s.", getName()); |
| |
| return Plugin::success(); |
| } |
| |
| /// Launch the AMDGPU kernel function. |
| Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, |
| uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) const override; |
| |
| /// Print more elaborate kernel launch info for AMDGPU |
| Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
| KernelArgsTy &KernelArgs, uint32_t NumThreads, |
| uint64_t NumBlocks) const override; |
| |
| /// Get group and private segment kernel size. |
| uint32_t getGroupSize() const { return GroupSize; } |
| uint32_t getPrivateSize() const { return PrivateSize; } |
| |
| /// Get the HSA kernel object representing the kernel function. |
| uint64_t getKernelObject() const { return KernelObject; } |
| |
| /// Get the size of the implicit arguments based on the code object version. |
| /// @return 56 for COV4 and 256 for COV5 |
| uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; } |
| |
| /// Indicates whether or not we need to set up our own private segment size. |
| bool usesDynamicStack() const { return DynamicStack; } |
| |
| private: |
| /// The kernel object to execute. |
| uint64_t KernelObject; |
| |
| /// The args, group, and private segment sizes required by a kernel instance. |
| uint32_t ArgsSize; |
| uint32_t GroupSize; |
| uint32_t PrivateSize; |
| bool DynamicStack; |
| |
| /// The size of implicit kernel arguments. |
| uint32_t ImplicitArgsSize; |
| |
| /// Additional Info for the AMD GPU Kernel |
| std::optional<utils::KernelMetaDataTy> KernelInfo; |
| }; |
| |
| /// Class representing an HSA signal. Signals are used to define dependencies |
| /// between asynchronous operations: kernel launches and memory transfers. |
| struct AMDGPUSignalTy { |
| /// Create an empty signal. |
| AMDGPUSignalTy() : HSASignal({0}), UseCount() {} |
| AMDGPUSignalTy(AMDGPUDeviceTy &Device) : HSASignal({0}), UseCount() {} |
| |
| /// Initialize the signal with an initial value. |
| Error init(uint32_t InitialValue = 1) { |
| hsa_status_t Status = |
| hsa_amd_signal_create(InitialValue, 0, nullptr, 0, &HSASignal); |
| return Plugin::check(Status, "Error in hsa_signal_create: %s"); |
| } |
| |
| /// Deinitialize the signal. |
| Error deinit() { |
| hsa_status_t Status = hsa_signal_destroy(HSASignal); |
| return Plugin::check(Status, "Error in hsa_signal_destroy: %s"); |
| } |
| |
| /// Wait until the signal gets a zero value. |
| Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr, |
| GenericDeviceTy *Device = nullptr) const { |
| if (ActiveTimeout && !RPCServer) { |
| hsa_signal_value_t Got = 1; |
| Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, |
| ActiveTimeout, HSA_WAIT_STATE_ACTIVE); |
| if (Got == 0) |
| return Plugin::success(); |
| } |
| |
| // If there is an RPC device attached to this stream we run it as a server. |
| uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX; |
| auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED; |
| while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0, |
| Timeout, WaitState) != 0) { |
| if (RPCServer && Device) |
| if (auto Err = RPCServer->runServer(*Device)) |
| return Err; |
| } |
| return Plugin::success(); |
| } |
| |
| /// Load the value on the signal. |
| hsa_signal_value_t load() const { |
| return hsa_signal_load_scacquire(HSASignal); |
| } |
| |
| /// Decrement the signal by one. |
| void signal() { |
| assert(load() > 0 && "Invalid signal value"); |
| hsa_signal_subtract_screlease(HSASignal, 1); |
| } |
| |
| /// Reset the signal value before reusing the signal. Do not call this |
| /// function if the signal is being currently used by any watcher, such as a |
| /// plugin thread or the HSA runtime. |
| void reset() { hsa_signal_store_screlease(HSASignal, 1); } |
| |
| /// Increase the number of concurrent uses. |
| void increaseUseCount() { UseCount.increase(); } |
| |
| /// Decrease the number of concurrent uses and return whether it was the last. |
| bool decreaseUseCount() { return UseCount.decrease(); } |
| |
| hsa_signal_t get() const { return HSASignal; } |
| |
| private: |
| /// The underlying HSA signal. |
| hsa_signal_t HSASignal; |
| |
| /// Reference counter for tracking the concurrent use count. This is mainly |
| /// used for knowing how many streams are using the signal. |
| RefCountTy<> UseCount; |
| }; |
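| |
| /// Example (an illustrative sketch): the typical lifecycle of a signal used |
| /// as the completion signal of a single asynchronous operation. |
| /// \code |
| ///   AMDGPUSignalTy Signal; |
| ///   if (auto Err = Signal.init()) |
| ///     return Err; |
| ///   // ... pass Signal.get() as the operation's completion signal ... |
| ///   if (auto Err = Signal.wait()) |
| ///     return Err; |
| ///   if (auto Err = Signal.deinit()) |
| ///     return Err; |
| /// \endcode |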
| |
| /// Classes for holding AMDGPU signals and managing signals. |
| using AMDGPUSignalRef = AMDGPUResourceRef<AMDGPUSignalTy>; |
| using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>; |
| |
| /// Class holding an HSA queue to submit kernel and barrier packets. |
| struct AMDGPUQueueTy { |
| /// Create an empty queue. |
| AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {} |
| |
| /// Lazily initialize a new queue belonging to a specific agent. |
| Error init(hsa_agent_t Agent, int32_t QueueSize) { |
| if (Queue) |
| return Plugin::success(); |
| hsa_status_t Status = |
| hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError, |
| nullptr, UINT32_MAX, UINT32_MAX, &Queue); |
| return Plugin::check(Status, "Error in hsa_queue_create: %s"); |
| } |
| |
| /// Deinitialize the queue and destroy its resources. |
| Error deinit() { |
| std::lock_guard<std::mutex> Lock(Mutex); |
| if (!Queue) |
| return Plugin::success(); |
| hsa_status_t Status = hsa_queue_destroy(Queue); |
| return Plugin::check(Status, "Error in hsa_queue_destroy: %s"); |
| } |
| |
| /// Returns the number of streams this queue is currently assigned to. |
| uint32_t getUserCount() const { return NumUsers; } |
| |
| /// Returns whether the underlying HSA queue is initialized. |
| bool isInitialized() { return Queue != nullptr; } |
| |
| /// Decrement user count of the queue object. |
| void removeUser() { --NumUsers; } |
| |
| /// Increase user count of the queue object. |
| void addUser() { ++NumUsers; } |
| |
| /// Push a kernel launch to the queue. The kernel launch requires an output |
| /// signal and can define an optional input signal (nullptr if none). |
| Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, |
| uint32_t NumThreads, uint64_t NumBlocks, |
| uint32_t GroupSize, uint64_t StackSize, |
| AMDGPUSignalTy *OutputSignal, |
| AMDGPUSignalTy *InputSignal) { |
| assert(OutputSignal && "Invalid kernel output signal"); |
| |
| // Lock the queue during the packet publishing process. Notice this blocks |
| // the addition of other packets to the queue. The following piece of code |
| // should be lightweight; do not block the thread, allocate memory, etc. |
| std::lock_guard<std::mutex> Lock(Mutex); |
| assert(Queue && "Interacted with a non-initialized queue!"); |
| |
| // Avoid defining the input dependency if already satisfied. |
| if (InputSignal && !InputSignal->load()) |
| InputSignal = nullptr; |
| |
| // Add a barrier packet before the kernel packet in case there is a pending |
| // preceding operation. The barrier packet will delay the processing of |
| // subsequent packets on the queue until the barrier's input signals are |
| // satisfied. No output signal is needed because the dependency is already |
| // guaranteed by the queue barrier itself. |
| if (InputSignal) |
| if (auto Err = pushBarrierImpl(nullptr, InputSignal)) |
| return Err; |
| |
| // Now prepare the kernel packet. |
| uint64_t PacketId; |
| hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId); |
| assert(Packet && "Invalid packet"); |
| |
| // The first 32 bits of the packet are written after the other fields |
| uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; |
| Packet->workgroup_size_x = NumThreads; |
| Packet->workgroup_size_y = 1; |
| Packet->workgroup_size_z = 1; |
| Packet->reserved0 = 0; |
| Packet->grid_size_x = NumBlocks * NumThreads; |
| Packet->grid_size_y = 1; |
| Packet->grid_size_z = 1; |
| Packet->private_segment_size = |
| Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize(); |
| Packet->group_segment_size = GroupSize; |
| Packet->kernel_object = Kernel.getKernelObject(); |
| Packet->kernarg_address = KernelArgs; |
| Packet->reserved2 = 0; |
| Packet->completion_signal = OutputSignal->get(); |
| |
| // Publish the packet. Do not modify the packet after this point. |
| publishKernelPacket(PacketId, Setup, Packet); |
| |
| return Plugin::success(); |
| } |
| |
| /// Push a barrier packet that will wait on up to two input signals. All |
| /// signals are optional (nullptr if none). |
| Error pushBarrier(AMDGPUSignalTy *OutputSignal, |
| const AMDGPUSignalTy *InputSignal1, |
| const AMDGPUSignalTy *InputSignal2) { |
| // Lock the queue during the packet publishing process. |
| std::lock_guard<std::mutex> Lock(Mutex); |
| assert(Queue && "Interacted with a non-initialized queue!"); |
| |
| // Push the barrier with the lock acquired. |
| return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2); |
| } |
| |
| private: |
| /// Push a barrier packet that will wait on up to two input signals. Assumes |
| /// the queue lock is acquired. |
| Error pushBarrierImpl(AMDGPUSignalTy *OutputSignal, |
| const AMDGPUSignalTy *InputSignal1, |
| const AMDGPUSignalTy *InputSignal2 = nullptr) { |
| // Add a queue barrier waiting on both the other stream's operation and the |
| // last operation on the current stream (if any). |
| uint64_t PacketId; |
| hsa_barrier_and_packet_t *Packet = |
| (hsa_barrier_and_packet_t *)acquirePacket(PacketId); |
| assert(Packet && "Invalid packet"); |
| |
| Packet->reserved0 = 0; |
| Packet->reserved1 = 0; |
| Packet->dep_signal[0] = {0}; |
| Packet->dep_signal[1] = {0}; |
| Packet->dep_signal[2] = {0}; |
| Packet->dep_signal[3] = {0}; |
| Packet->dep_signal[4] = {0}; |
| Packet->reserved2 = 0; |
| Packet->completion_signal = {0}; |
| |
| // Set input and output dependencies if needed. |
| if (OutputSignal) |
| Packet->completion_signal = OutputSignal->get(); |
| if (InputSignal1) |
| Packet->dep_signal[0] = InputSignal1->get(); |
| if (InputSignal2) |
| Packet->dep_signal[1] = InputSignal2->get(); |
| |
| // Publish the packet. Do not modify the packet after this point. |
| publishBarrierPacket(PacketId, Packet); |
| |
| return Plugin::success(); |
| } |
| |
| /// Acquire a packet from the queue. This call may block the thread if there |
| /// is no space in the underlying HSA queue. It may need to wait until the HSA |
| /// runtime processes some packets. Assumes the queue lock is acquired. |
| hsa_kernel_dispatch_packet_t *acquirePacket(uint64_t &PacketId) { |
| // Increase the queue index with relaxed memory order. Notice this will need |
| // another subsequent atomic operation with acquire order. |
| PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); |
| |
| // Wait for the packet to be available. Notice the atomic operation uses |
| // the acquire memory order. |
| while (PacketId - hsa_queue_load_read_index_scacquire(Queue) >= Queue->size) |
| ; |
| |
| // Return the packet reference. |
| const uint32_t Mask = Queue->size - 1; // The size is a power of 2. |
| return (hsa_kernel_dispatch_packet_t *)Queue->base_address + |
| (PacketId & Mask); |
| } |
| |
| /// Publish the kernel packet so that the HSA runtime can start processing |
| /// the kernel launch. Do not modify the packet once this function is called. |
| /// Assumes the queue lock is acquired. |
| void publishKernelPacket(uint64_t PacketId, uint16_t Setup, |
| hsa_kernel_dispatch_packet_t *Packet) { |
| uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet); |
| |
| uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; |
| Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
| Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
| |
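| // Per the HSA packet format, bits [0,16) of the first word hold the header |
| // (packet type and fence scopes) and bits [16,32) hold the setup (number of |
| // dispatch dimensions); hence Setup is shifted left by 16 below. |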
| // Publish the packet. Do not modify the packet after this point. |
| uint32_t HeaderWord = Header | (Setup << 16u); |
| __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); |
| |
| // Signal the doorbell about the published packet. |
| hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
| } |
| |
| /// Publish the barrier packet so that the HSA runtime can start processing |
| /// the barrier. Next packets in the queue will not be processed until all |
| /// barrier dependencies (signals) are satisfied. Assumes the queue is locked. |
| void publishBarrierPacket(uint64_t PacketId, |
| hsa_barrier_and_packet_t *Packet) { |
| uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet); |
| uint16_t Setup = 0; |
| uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; |
| Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
| Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
| |
| // Publish the packet. Do not modify the packet after this point. |
| uint32_t HeaderWord = Header | (Setup << 16u); |
| __atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE); |
| |
| // Signal the doorbell about the published packet. |
| hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
| } |
| |
| /// Callback that will be called when an error is detected on the HSA queue. |
| static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) { |
| auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source); |
| FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data()); |
| } |
| |
| /// The HSA queue. |
| hsa_queue_t *Queue; |
| |
| /// Mutex to protect the acquiring and publishing of packets. For the moment, |
| /// we need this mutex to prevent publishing packets that are not ready to be |
| /// published in a multi-thread scenario. Without a queue lock, a thread T1 |
| /// could acquire packet P and thread T2 acquire packet P+1. Thread T2 could |
| /// publish its packet P+1 (signaling the queue's doorbell) before packet P |
| /// from T1 is ready to be processed. That scenario should be invalid. Thus, |
| /// we use the following mutex to make packet acquiring and publishing atomic. |
| /// TODO: There are other more advanced approaches to avoid this mutex using |
| /// atomic operations. We can further investigate it if this is a bottleneck. |
| std::mutex Mutex; |
| |
| /// The number of streams this queue is currently assigned to. A queue is |
| /// considered idle when this is zero, and busy otherwise. |
| uint32_t NumUsers; |
| }; |
| |
| /// Struct that implements a stream of asynchronous operations for AMDGPU |
| /// devices. This class relies on signals to implement streams and define the |
| /// dependencies between asynchronous operations. |
| struct AMDGPUStreamTy { |
| private: |
| /// Utility struct holding arguments for async H2H memory copies. |
| struct MemcpyArgsTy { |
| void *Dst; |
| const void *Src; |
| size_t Size; |
| }; |
| |
| /// Utility struct holding arguments for freeing buffers to memory managers. |
| struct ReleaseBufferArgsTy { |
| void *Buffer; |
| AMDGPUMemoryManagerTy *MemoryManager; |
| }; |
| |
| /// Utility struct holding arguments for releasing signals to signal managers. |
| struct ReleaseSignalArgsTy { |
| AMDGPUSignalTy *Signal; |
| AMDGPUSignalManagerTy *SignalManager; |
| }; |
| |
| /// The stream is composed of N slots. The struct below represents |
| /// the fields of each slot. Each slot has a signal and an optional action |
| /// function. When appending an HSA asynchronous operation to the stream, one |
| /// slot is consumed and used to store the operation's information. The |
| /// operation's output signal is set to the consumed slot's signal. If there |
| /// is a previous asynchronous operation on the previous slot, the HSA async |
| /// operation's input signal is set to the signal of the previous slot. This |
| /// way, we obtain a chain of dependent async operations. The action is a |
| /// function that will be executed eventually after the operation is |
| /// completed, e.g., for releasing a buffer. |
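| /// |
| /// For instance, after appending three operations, the chain looks like the |
| /// following sketch, where S_i denotes the signal of slot i: |
| /// |
| ///   op0 -(out: S_0)-> op1 -(in: S_0, out: S_1)-> op2 -(in: S_1, out: S_2) |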
| struct StreamSlotTy { |
| /// The output signal of the stream operation. May be used by the subsequent |
| /// operation as input signal. |
| AMDGPUSignalTy *Signal; |
| |
| /// The action that must be performed after the operation's completion. Set |
| /// to nullptr when there is no action to perform. |
| Error (*ActionFunction)(void *); |
| |
| /// Space for the action's arguments. A pointer to these arguments is passed |
| /// to the action function. Notice the space of arguments is limited. |
| union { |
| MemcpyArgsTy MemcpyArgs; |
| ReleaseBufferArgsTy ReleaseBufferArgs; |
| ReleaseSignalArgsTy ReleaseSignalArgs; |
| } ActionArgs; |
| |
| /// Create an empty slot. |
| StreamSlotTy() : Signal(nullptr), ActionFunction(nullptr) {} |
| |
| /// Schedule a host memory copy action on the slot. |
| Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) { |
| ActionFunction = memcpyAction; |
| ActionArgs.MemcpyArgs = MemcpyArgsTy{Dst, Src, Size}; |
| return Plugin::success(); |
| } |
| |
| /// Schedule a release buffer action on the slot. |
| Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) { |
| ActionFunction = releaseBufferAction; |
| ActionArgs.ReleaseBufferArgs = ReleaseBufferArgsTy{Buffer, &Manager}; |
| return Plugin::success(); |
| } |
| |
| /// Schedule a signal release action on the slot. |
| Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease, |
| AMDGPUSignalManagerTy *SignalManager) { |
| ActionFunction = releaseSignalAction; |
| ActionArgs.ReleaseSignalArgs = |
| ReleaseSignalArgsTy{SignalToRelease, SignalManager}; |
| return Plugin::success(); |
| } |
| |
| // Perform the action if needed. |
| Error performAction() { |
| if (!ActionFunction) |
| return Plugin::success(); |
| |
| // Perform the action. |
| if (ActionFunction == memcpyAction) { |
| if (auto Err = memcpyAction(&ActionArgs)) |
| return Err; |
| } else if (ActionFunction == releaseBufferAction) { |
| if (auto Err = releaseBufferAction(&ActionArgs)) |
| return Err; |
| } else if (ActionFunction == releaseSignalAction) { |
| if (auto Err = releaseSignalAction(&ActionArgs)) |
| return Err; |
| } else { |
| return Plugin::error("Unknown action function!"); |
| } |
| |
| // Invalidate the action. |
| ActionFunction = nullptr; |
| |
| return Plugin::success(); |
| } |
| }; |
| |
| /// The device agent where the stream was created. |
| hsa_agent_t Agent; |
| |
| /// The queue that the stream uses to launch kernels. |
| AMDGPUQueueTy *Queue; |
| |
| /// The manager that allows signals to be reused across operations. |
| AMDGPUSignalManagerTy &SignalManager; |
| |
| /// A reference to the associated device. |
| GenericDeviceTy &Device; |
| |
| /// Array of stream slots. Use std::deque because it can dynamically grow |
| /// without invalidating the already inserted elements. For instance, a |
| /// std::vector may invalidate its elements by reallocating the internal |
| /// array when there is not enough space for new insertions. |
| std::deque<StreamSlotTy> Slots; |
| |
| /// The next available slot on the stream. This is reset to zero each time the |
| /// stream is synchronized. It also indicates the current number of consumed |
| /// slots at a given time. |
| uint32_t NextSlot; |
| |
| /// The synchronization id. This number is increased each time the stream is |
| /// synchronized. It is useful to detect if an AMDGPUEventTy points to an |
| /// operation that was already finalized in a previous stream synchronize. |
| uint32_t SyncCycle; |
| |
| /// A pointer associated with an RPC server running on the given device. If |
| /// RPC is not being used this will be a null pointer. Otherwise, this |
| /// indicates that an RPC server is expected to be run on this stream. |
| RPCServerTy *RPCServer; |
| |
| /// Mutex to protect stream's management. |
| mutable std::mutex Mutex; |
| |
| /// Timeout hint for HSA while actively waiting for a signal value to change. |
| const uint64_t StreamBusyWaitMicroseconds; |
| |
| /// Indicate whether to spread data transfers across all available SDMA |
| /// engines. |
| bool UseMultipleSdmaEngines; |
| |
| /// Return the current number of asynchronous operations on the stream. |
| uint32_t size() const { return NextSlot; } |
| |
| /// Return the last valid slot on the stream. |
| uint32_t last() const { return size() - 1; } |
| |
| /// Consume one slot from the stream. Since the stream uses signals on demand |
| /// and releases them once the slot is no longer used, the function requires |
| /// an idle signal for the new consumed slot. |
| std::pair<uint32_t, AMDGPUSignalTy *> consume(AMDGPUSignalTy *OutputSignal) { |
| // Double the stream size if needed. Since we use std::deque, this operation |
| // does not invalidate the already added slots. |
| if (Slots.size() == NextSlot) |
| Slots.resize(Slots.size() * 2); |
| |
| // Update the next available slot and the stream size. |
| uint32_t Curr = NextSlot++; |
| |
| // Retrieve the input signal, if any, of the current operation. |
| AMDGPUSignalTy *InputSignal = (Curr > 0) ? Slots[Curr - 1].Signal : nullptr; |
| |
| // Set the output signal of the current slot. |
| Slots[Curr].Signal = OutputSignal; |
| |
| return std::make_pair(Curr, InputSignal); |
| } |
| |
| /// Complete all pending post actions and reset the stream after synchronizing |
| /// or positively querying the stream. |
| Error complete() { |
| for (uint32_t Slot = 0; Slot < NextSlot; ++Slot) { |
| // Take the post action of the operation if any. |
| if (auto Err = Slots[Slot].performAction()) |
| return Err; |
| |
| // Release the slot's signal if possible. Otherwise, another user will. |
| if (Slots[Slot].Signal->decreaseUseCount()) |
| if (auto Err = SignalManager.returnResource(Slots[Slot].Signal)) |
| return Err; |
| |
| Slots[Slot].Signal = nullptr; |
| } |
| |
| // Reset the stream slots to zero. |
| NextSlot = 0; |
| |
| // Increase the synchronization id since the stream completed a sync cycle. |
| SyncCycle += 1; |
| |
| return Plugin::success(); |
| } |
| |
| /// Make the current stream wait on a specific operation of another stream. |
| /// The idea is to make the current stream wait on two signals: 1) the last |
| /// signal of the current stream, and 2) the last signal of the other stream. |
| /// Use a barrier packet with two input signals. |
| Error waitOnStreamOperation(AMDGPUStreamTy &OtherStream, uint32_t Slot) { |
| if (Queue == nullptr) |
| return Plugin::error("Target queue was nullptr"); |
| |
| /// The signal of the other stream that we must wait on. |
| AMDGPUSignalTy *OtherSignal = OtherStream.Slots[Slot].Signal; |
| |
| // Prevent the release of the other stream's signal. |
| OtherSignal->increaseUseCount(); |
| |
| // Retrieve an available signal for the operation's output. |
| AMDGPUSignalTy *OutputSignal = nullptr; |
| if (auto Err = SignalManager.getResource(OutputSignal)) |
| return Err; |
| OutputSignal->reset(); |
| OutputSignal->increaseUseCount(); |
| |
| // Consume stream slot and compute dependencies. |
| auto [Curr, InputSignal] = consume(OutputSignal); |
| |
| // Setup the post action to release the signal. |
| if (auto Err = Slots[Curr].schedReleaseSignal(OtherSignal, &SignalManager)) |
| return Err; |
| |
| // Push a barrier into the queue with both input signals. |
| return Queue->pushBarrier(OutputSignal, InputSignal, OtherSignal); |
| } |
| |
| /// Callback for running a specific asynchronous operation. This callback is |
| /// used for hsa_amd_signal_async_handler. The argument is the operation that |
| /// should be executed. Notice we use the post action mechanism to codify the |
| /// asynchronous operation. |
| static bool asyncActionCallback(hsa_signal_value_t Value, void *Args) { |
| StreamSlotTy *Slot = reinterpret_cast<StreamSlotTy *>(Args); |
| assert(Slot && "Invalid slot"); |
| assert(Slot->Signal && "Invalid signal"); |
| |
| // This thread is outside the stream mutex. Make sure the thread sees the |
| // changes on the slot. |
| std::atomic_thread_fence(std::memory_order_acquire); |
| |
| // Perform the operation. |
| if (auto Err = Slot->performAction()) |
| FATAL_MESSAGE(1, "Error performing post action: %s", |
| toString(std::move(Err)).data()); |
| |
| // Signal the output signal to notify that the asynchronous operation has |
| // finalized. |
| Slot->Signal->signal(); |
| |
| // Unregister callback. |
| return false; |
| } |
| |
| // Callback for host-to-host memory copies. This is an asynchronous action. |
| static Error memcpyAction(void *Data) { |
| MemcpyArgsTy *Args = reinterpret_cast<MemcpyArgsTy *>(Data); |
| assert(Args && "Invalid arguments"); |
| assert(Args->Dst && "Invalid destination buffer"); |
| assert(Args->Src && "Invalid source buffer"); |
| |
| std::memcpy(Args->Dst, Args->Src, Args->Size); |
| |
| return Plugin::success(); |
| } |
| |
| /// Releasing a memory buffer to a memory manager. This is a post completion |
| /// action. There are three kinds of memory buffers: |
| /// 1. For kernel arguments. This buffer can be freed after receiving the |
| ///    kernel completion signal. |
| /// 2. For H2D transfers that need pinned memory space for staging. This |
| ///    buffer can be freed after receiving the transfer completion signal. |
| /// 3. For D2H transfers that need pinned memory space for staging. This |
| ///    buffer cannot be freed after receiving the transfer completion signal |
| ///    because of the following asynchronous H2H callback. For this reason, |
| ///    this action can only be taken at AMDGPUStreamTy::complete(). |
| /// Because of case 3, all releaseBufferActions are taken at |
| /// AMDGPUStreamTy::complete() in the current implementation. |
| static Error releaseBufferAction(void *Data) { |
| ReleaseBufferArgsTy *Args = reinterpret_cast<ReleaseBufferArgsTy *>(Data); |
| assert(Args && "Invalid arguments"); |
| assert(Args->MemoryManager && "Invalid memory manager"); |
| assert(Args->Buffer && "Invalid buffer"); |
| |
| // Release the allocation to the memory manager. |
| return Args->MemoryManager->deallocate(Args->Buffer); |
| } |
| |
| /// Releasing a signal object back to the signal manager. This is a post |
| /// completion action that can only be taken at AMDGPUStreamTy::complete(). |
| static Error releaseSignalAction(void *Data) { |
| ReleaseSignalArgsTy *Args = reinterpret_cast<ReleaseSignalArgsTy *>(Data); |
| assert(Args && "Invalid arguments"); |
| assert(Args->Signal && "Invalid signal"); |
| assert(Args->SignalManager && "Invalid signal manager"); |
| |
| // Release the signal if needed. |
| if (Args->Signal->decreaseUseCount()) |
| if (auto Err = Args->SignalManager->returnResource(Args->Signal)) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| public: |
| /// Create an empty stream associated with a specific device. |
| AMDGPUStreamTy(AMDGPUDeviceTy &Device); |
| |
| /// Initialize the stream's signals. |
| Error init() { return Plugin::success(); } |
| |
| /// Deinitialize the stream's signals. |
| Error deinit() { return Plugin::success(); } |
| |
| /// Attach an RPC server to this stream. |
| void setRPCServer(RPCServerTy *Server) { RPCServer = Server; } |
| |
| /// Push an asynchronous kernel to the stream. The kernel arguments must be |
| /// placed in a special allocation for kernel args and must be kept alive |
| /// until the kernel finalizes. Once the kernel is finished, the stream will |
| /// release the kernel args buffer to the specified memory manager. |
| Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, |
| uint32_t NumThreads, uint64_t NumBlocks, |
| uint32_t GroupSize, uint64_t StackSize, |
| AMDGPUMemoryManagerTy &MemoryManager) { |
| if (Queue == nullptr) |
| return Plugin::error("Target queue was nullptr"); |
| |
| // Retrieve an available signal for the operation's output. |
| AMDGPUSignalTy *OutputSignal = nullptr; |
| if (auto Err = SignalManager.getResource(OutputSignal)) |
| return Err; |
| OutputSignal->reset(); |
| OutputSignal->increaseUseCount(); |
| |
| std::lock_guard<std::mutex> StreamLock(Mutex); |
| |
| // Consume stream slot and compute dependencies. |
| auto [Curr, InputSignal] = consume(OutputSignal); |
| |
| // Setup the post action to release the kernel args buffer. |
| if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager)) |
| return Err; |
| |
| // Push the kernel with the output signal and an input signal (optional) |
| return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks, |
| GroupSize, StackSize, OutputSignal, |
| InputSignal); |
| } |
| |
| /// Push an asynchronous memory copy between pinned memory buffers. |
| Error pushPinnedMemoryCopyAsync(void *Dst, const void *Src, |
| uint64_t CopySize) { |
| // Retrieve an available signal for the operation's output. |
| AMDGPUSignalTy *OutputSignal = nullptr; |
| if (auto Err = SignalManager.getResource(OutputSignal)) |
| return Err; |
| OutputSignal->reset(); |
| OutputSignal->increaseUseCount(); |
| |
| std::lock_guard<std::mutex> Lock(Mutex); |
| |
| // Consume stream slot and compute dependencies. |
| auto [Curr, InputSignal] = consume(OutputSignal); |
| |
| // Avoid defining the input dependency if already satisfied. |
| if (InputSignal && !InputSignal->load()) |
| InputSignal = nullptr; |
| |
| // Issue the async memory copy. |
| if (InputSignal) { |
| hsa_signal_t InputSignalRaw = InputSignal->get(); |
| return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, |
| CopySize, 1, &InputSignalRaw, |
| OutputSignal->get()); |
| } |
| |
| return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent, |
| CopySize, 0, nullptr, OutputSignal->get()); |
| } |
| |
| /// Push an asynchronous memory copy device-to-host involving an unpinned |
| /// memory buffer. The operation consists of a two-step copy from the |
| /// device buffer to an intermediate pinned host buffer, and then to an |
| /// unpinned host buffer. Both operations are asynchronous and dependent. |
| /// The intermediate pinned buffer will be released to the specified memory |
| /// manager once the operation completes. |
| Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter, |
| uint64_t CopySize, |
| AMDGPUMemoryManagerTy &MemoryManager) { |
| // Retrieve available signals for the operation's outputs. |
| AMDGPUSignalTy *OutputSignals[2] = {}; |
| if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) |
| return Err; |
| for (auto *Signal : OutputSignals) { |
| Signal->reset(); |
| Signal->increaseUseCount(); |
| } |
| |
| std::lock_guard<std::mutex> Lock(Mutex); |
| |
| // Consume stream slot and compute dependencies. |
| auto [Curr, InputSignal] = consume(OutputSignals[0]); |
| |
| // Avoid defining the input dependency if already satisfied. |
| if (InputSignal && !InputSignal->load()) |
| InputSignal = nullptr; |
| |
| // Setup the post action for releasing the intermediate buffer. |
| if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager)) |
| return Err; |
| |
| // Issue the first step: device to host transfer. Avoid defining the input |
| // dependency if already satisfied. |
| if (InputSignal) { |
| hsa_signal_t InputSignalRaw = InputSignal->get(); |
| if (auto Err = utils::asyncMemCopy( |
| UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1, |
| &InputSignalRaw, OutputSignals[0]->get())) |
| return Err; |
| } else { |
| if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent, |
| Src, Agent, CopySize, 0, nullptr, |
| OutputSignals[0]->get())) |
| return Err; |
| } |
| |
| // Consume another stream slot and compute dependencies. |
| std::tie(Curr, InputSignal) = consume(OutputSignals[1]); |
| assert(InputSignal && "Invalid input signal"); |
| |
| // The std::memcpy is done asynchronously using an async handler. We store |
| // the function's information in the action but it is not actually a post |
| // action. |
| if (auto Err = Slots[Curr].schedHostMemoryCopy(Dst, Inter, CopySize)) |
| return Err; |
| |
| // Make changes on this slot visible to the async handler's thread. |
| std::atomic_thread_fence(std::memory_order_release); |
| |
| // Issue the second step: host to host transfer. |
| hsa_status_t Status = hsa_amd_signal_async_handler( |
| InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, |
| (void *)&Slots[Curr]); |
| |
| return Plugin::check(Status, "Error in hsa_amd_signal_async_handler: %s"); |
| } |
| |
| /// Push an asynchronous memory copy host-to-device involving an unpinned |
| /// memory buffer. The operation consists of a two-step copy from the |
| /// unpinned host buffer to an intermediate pinned host buffer, and then to |
| /// the device buffer. Both operations are asynchronous and dependent. |
| /// The intermediate pinned buffer will be released to the specified memory |
| /// manager once the operation completes. |
| Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter, |
| uint64_t CopySize, |
| AMDGPUMemoryManagerTy &MemoryManager) { |
| // Retrieve available signals for the operation's outputs. |
| AMDGPUSignalTy *OutputSignals[2] = {}; |
| if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals)) |
| return Err; |
| for (auto *Signal : OutputSignals) { |
| Signal->reset(); |
| Signal->increaseUseCount(); |
| } |
| |
| AMDGPUSignalTy *OutputSignal = OutputSignals[0]; |
| |
| std::lock_guard<std::mutex> Lock(Mutex); |
| |
| // Consume stream slot and compute dependencies. |
| auto [Curr, InputSignal] = consume(OutputSignal); |
| |
| // Avoid defining the input dependency if already satisfied. |
| if (InputSignal && !InputSignal->load()) |
| InputSignal = nullptr; |
| |
| // Issue the first step: host to host transfer. |
| if (InputSignal) { |
| // The std::memcpy is done asynchronously using an async handler. We store |
| // the function's information in the action but it is not actually a |
| // post action. |
| if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize)) |
| return Err; |
| |
| // Make changes on this slot visible to the async handler's thread. |
| std::atomic_thread_fence(std::memory_order_release); |
| |
| hsa_status_t Status = hsa_amd_signal_async_handler( |
| InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback, |
| (void *)&Slots[Curr]); |
| |
| if (auto Err = Plugin::check(Status, |
| "Error in hsa_amd_signal_async_handler: %s")) |
| return Err; |
| |
| // Now use the second output signal. |
| OutputSignal = OutputSignals[1]; |
| |
| // Consume another stream slot and compute dependencies. |
| std::tie(Curr, InputSignal) = consume(OutputSignal); |
| } else { |
| // All preceding operations completed, copy the memory synchronously. |
| std::memcpy(Inter, Src, CopySize); |
| |
| // Return the second signal because it will not be used. |
| OutputSignals[1]->decreaseUseCount(); |
| if (auto Err = SignalManager.returnResource(OutputSignals[1])) |
| return Err; |
| } |
| |
| // Setup the post action to release the intermediate pinned buffer. |
| if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager)) |
| return Err; |
| |
| // Issue the second step: host to device transfer. Avoid defining the input |
| // dependency if already satisfied. |
| if (InputSignal && InputSignal->load()) { |
| hsa_signal_t InputSignalRaw = InputSignal->get(); |
| return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, |
| Agent, CopySize, 1, &InputSignalRaw, |
| OutputSignal->get()); |
| } |
| return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent, |
| CopySize, 0, nullptr, OutputSignal->get()); |
| } |
| |
| // AMDGPUDeviceTy is incomplete here, so pass the underlying agents instead. |
| Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src, |
| hsa_agent_t SrcAgent, uint64_t CopySize) { |
| AMDGPUSignalTy *OutputSignal; |
| if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal)) |
| return Err; |
| OutputSignal->reset(); |
| OutputSignal->increaseUseCount(); |
| |
| std::lock_guard<std::mutex> Lock(Mutex); |
| |
| // Consume stream slot and compute dependencies. |
| auto [Curr, InputSignal] = consume(OutputSignal); |
| |
| // Avoid defining the input dependency if already satisfied. |
| if (InputSignal && !InputSignal->load()) |
| InputSignal = nullptr; |
| |
| // The agents need to have access to the corresponding memory. This is |
| // presently only true if the pointers were originally allocated by this |
| // runtime or the caller made the appropriate access calls. |
| |
| if (InputSignal && InputSignal->load()) { |
| hsa_signal_t InputSignalRaw = InputSignal->get(); |
| return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, |
| SrcAgent, CopySize, 1, &InputSignalRaw, |
| OutputSignal->get()); |
| } |
| return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src, |
| SrcAgent, CopySize, 0, nullptr, |
| OutputSignal->get()); |
| } |
| |
| /// Synchronize with the stream. The current thread waits until all operations |
| /// are finalized and it performs the pending post actions (i.e., releasing |
| /// intermediate buffers). |
| Error synchronize() { |
| std::lock_guard<std::mutex> Lock(Mutex); |
| |
| // No need to synchronize anything. |
| if (size() == 0) |
| return Plugin::success(); |
| |
| // Wait until all previous operations on the stream have completed. |
| if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, |
| RPCServer, &Device)) |
| return Err; |
| |
| // Reset the stream and perform all pending post actions. |
| return complete(); |
| } |
| |
| /// Query the stream and complete pending post actions if operations finished. |
| /// Return whether all the operations completed. This operation does not block |
| /// the calling thread. |
| Expected<bool> query() { |
| std::lock_guard<std::mutex> Lock(Mutex); |
| |
| // No need to query anything. |
| if (size() == 0) |
| return true; |
| |
| // The last operation did not complete yet. Return directly. |
| if (Slots[last()].Signal->load()) |
| return false; |
| |
| // Reset the stream and perform all pending post actions. |
| if (auto Err = complete()) |
| return std::move(Err); |
| |
| return true; |
| } |
| |
| /// Record the state of the stream on an event. |
| Error recordEvent(AMDGPUEventTy &Event) const; |
| |
| /// Make the stream wait on an event. |
| Error waitEvent(const AMDGPUEventTy &Event); |
| |
| friend struct AMDGPUStreamManagerTy; |
| }; |
| |
| /// Class representing an event on AMDGPU. The event stores the recorded |
| /// stream, the operation slot, and the sync cycle, which together identify |
| /// a point in the recorded stream's execution. |
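| /// Typical use: record() the state of one stream, then wait() from another |
| /// stream to order it behind the recorded operation. |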
| struct AMDGPUEventTy { |
| /// Create an empty event. |
| AMDGPUEventTy(AMDGPUDeviceTy &Device) |
| : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {} |
| |
| /// Initialize and deinitialize. |
| Error init() { return Plugin::success(); } |
| Error deinit() { return Plugin::success(); } |
| |
| /// Record the state of a stream on the event. |
| Error record(AMDGPUStreamTy &Stream) { |
| std::lock_guard<std::mutex> Lock(Mutex); |
| |
| // Disregard any previously recorded stream and overwrite it. |
| RecordedStream = &Stream; |
| |
| return Stream.recordEvent(*this); |
| } |
| |
| /// Make a stream wait on the current event. |
| Error wait(AMDGPUStreamTy &Stream) { |
| std::lock_guard<std::mutex> Lock(Mutex); |
| |
| if (!RecordedStream) |
| return Plugin::error("Event does not have any recorded stream"); |
| |
| // Synchronizing the same stream. Do nothing. |
| if (RecordedStream == &Stream) |
| return Plugin::success(); |
| |
| // No need to wait for anything, the recorded stream already finished the |
| // corresponding operation. |
| if (RecordedSlot < 0) |
| return Plugin::success(); |
| |
| return Stream.waitEvent(*this); |
| } |
| |
| protected: |
| /// The stream registered in this event. |
| AMDGPUStreamTy *RecordedStream; |
| |
| /// The recorded operation on the recorded stream. |
| int64_t RecordedSlot; |
| |
| /// The sync cycle when the stream was recorded. Used to detect stale events. |
| int64_t RecordedSyncCycle; |
| |
| /// Mutex to safely access event fields. |
| mutable std::mutex Mutex; |
| |
| friend struct AMDGPUStreamTy; |
| }; |
| |
| Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const { |
| std::lock_guard<std::mutex> Lock(Mutex); |
| |
| if (size() > 0) { |
| // Record the synchronize identifier (to detect stale recordings) and |
| // the last valid stream's operation. |
| Event.RecordedSyncCycle = SyncCycle; |
| Event.RecordedSlot = last(); |
| |
| assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle"); |
| assert(Event.RecordedSlot >= 0 && "Invalid recorded slot"); |
| } else { |
| // The stream is empty; everything has already completed, so record nothing. |
| Event.RecordedSyncCycle = -1; |
| Event.RecordedSlot = -1; |
| } |
| return Plugin::success(); |
| } |
| |
| Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) { |
| // Retrieve the recorded stream on the event. |
| AMDGPUStreamTy &RecordedStream = *Event.RecordedStream; |
| |
| std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, RecordedStream.Mutex); |
| |
| // The recorded stream was synchronized (and thus reset) after the event was |
| // recorded, so the operation already completed. |
| if (RecordedStream.SyncCycle != (uint32_t)Event.RecordedSyncCycle) |
| return Plugin::success(); |
| |
| // Again, the recorded stream already completed the operation; the last |
| // operation's output signal is satisfied. |
| if (!RecordedStream.Slots[Event.RecordedSlot].Signal->load()) |
| return Plugin::success(); |
| |
| // Otherwise, make the current stream wait on the other stream's operation. |
| return waitOnStreamOperation(RecordedStream, Event.RecordedSlot); |
| } |
| |
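| /// Class implementing a resource manager (pool) of AMDGPU streams. It also |
| /// owns the underlying HSA queues and maps each requested stream to one of |
| /// them. |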
| struct AMDGPUStreamManagerTy final |
| : GenericDeviceResourceManagerTy<AMDGPUResourceRef<AMDGPUStreamTy>> { |
| using ResourceRef = AMDGPUResourceRef<AMDGPUStreamTy>; |
| using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>; |
| |
| AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) |
| : GenericDeviceResourceManagerTy(Device), |
| OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true), |
| NextQueue(0), Agent(HSAAgent) {} |
| |
| Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) { |
| Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues); |
| QueueSize = HSAQueueSize; |
| MaxNumQueues = NumHSAQueues; |
| // Initialize one queue eagerly; the rest are initialized lazily on demand |
| // in assignNextQueue() when first assigned to a stream. |
| if (auto Err = Queues.front().init(Agent, QueueSize)) |
| return Err; |
| |
| return GenericDeviceResourceManagerTy::init(InitialSize); |
| } |
| |
| /// Deinitialize the resource pool and delete all resources. This function |
| /// must be called before the destructor. |
| Error deinit() override { |
| // De-init all queues |
| for (AMDGPUQueueTy &Queue : Queues) { |
| if (auto Err = Queue.deinit()) |
| return Err; |
| } |
| |
| return GenericDeviceResourceManagerTy::deinit(); |
| } |
| |
| /// Get a single stream from the pool or create new resources. |
| virtual Error getResource(AMDGPUStreamTy *&StreamHandle) override { |
| return getResourcesImpl(1, &StreamHandle, [this](AMDGPUStreamTy *&Handle) { |
| return assignNextQueue(Handle); |
| }); |
| } |
| |
| /// Return stream to the pool. |
| virtual Error returnResource(AMDGPUStreamTy *StreamHandle) override { |
| return returnResourceImpl(StreamHandle, [](AMDGPUStreamTy *Handle) { |
| Handle->Queue->removeUser(); |
| return Plugin::success(); |
| }); |
| } |
| |
| private: |
| /// Search for and assign a preferably idle queue to the given stream. If |
| /// there is no queue without current users, choose the queue with the lowest |
| /// user count. If queue tracking is disabled, use round-robin selection. |
| inline Error assignNextQueue(AMDGPUStreamTy *Stream) { |
| // Start from zero when tracking utilization, otherwise: round robin policy. |
| uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues; |
| |
| if (OMPX_QueueTracking) { |
| // Find the least used queue. |
| for (uint32_t I = 0; I < MaxNumQueues; ++I) { |
| // Early exit when an initialized queue is idle. |
| if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) { |
| Index = I; |
| break; |
| } |
| |
| // Update the least used queue. |
| if (Queues[Index].getUserCount() > Queues[I].getUserCount()) |
| Index = I; |
| } |
| } |
| |
| // Make sure the queue is initialized, then add user & assign. |
| if (auto Err = Queues[Index].init(Agent, QueueSize)) |
| return Err; |
| Queues[Index].addUser(); |
| Stream->Queue = &Queues[Index]; |
| |
| return Plugin::success(); |
| } |
| |
| /// Envar for controlling the tracking of busy HSA queues. |
| BoolEnvar OMPX_QueueTracking; |
| |
| /// The next queue index to use for round robin selection. |
| uint32_t NextQueue; |
| |
| /// The queues which are assigned to requested streams. |
| std::vector<AMDGPUQueueTy> Queues; |
| |
| /// The corresponding device as HSA agent. |
| hsa_agent_t Agent; |
| |
| /// The maximum number of queues. |
| int MaxNumQueues; |
| |
| /// The size of created queues. |
| int QueueSize; |
| }; |
| |
| /// Abstract class that holds the common members of the actual kernel devices |
| /// and the host device. Both types should inherit from this class. |
| struct AMDGenericDeviceTy { |
| AMDGenericDeviceTy() {} |
| |
| virtual ~AMDGenericDeviceTy() {} |
| |
| /// Create all memory pools which the device has access to and classify them. |
| Error initMemoryPools() { |
| // Retrieve all memory pools from the device agent(s). |
| Error Err = retrieveAllMemoryPools(); |
| if (Err) |
| return Err; |
| |
| for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) { |
| // Initialize the memory pool and retrieve some basic info. |
| Error Err = MemoryPool->init(); |
| if (Err) |
| return Err; |
| |
| if (!MemoryPool->isGlobal()) |
| continue; |
| |
| // Classify the memory pools depending on their properties. |
| if (MemoryPool->isFineGrained()) { |
| FineGrainedMemoryPools.push_back(MemoryPool); |
| if (MemoryPool->supportsKernelArgs()) |
| ArgsMemoryPools.push_back(MemoryPool); |
| } else if (MemoryPool->isCoarseGrained()) { |
| CoarseGrainedMemoryPools.push_back(MemoryPool); |
| } |
| } |
| return Plugin::success(); |
| } |
| |
| /// Destroy all memory pools. |
| Error deinitMemoryPools() { |
| for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) |
| delete Pool; |
| |
| AllMemoryPools.clear(); |
| FineGrainedMemoryPools.clear(); |
| CoarseGrainedMemoryPools.clear(); |
| ArgsMemoryPools.clear(); |
| |
| return Plugin::success(); |
| } |
| |
| /// Retrieve and construct all memory pools from the device agent(s). |
| virtual Error retrieveAllMemoryPools() = 0; |
| |
| /// Get the device agent. |
| virtual hsa_agent_t getAgent() const = 0; |
| |
| protected: |
| /// Array of all memory pools available to the host agents. |
| llvm::SmallVector<AMDGPUMemoryPoolTy *> AllMemoryPools; |
| |
| /// Array of fine-grained memory pools available to the host agents. |
| llvm::SmallVector<AMDGPUMemoryPoolTy *> FineGrainedMemoryPools; |
| |
| /// Array of coarse-grained memory pools available to the host agents. |
| llvm::SmallVector<AMDGPUMemoryPoolTy *> CoarseGrainedMemoryPools; |
| |
| /// Array of kernel args memory pools available to the host agents. |
| llvm::SmallVector<AMDGPUMemoryPoolTy *> ArgsMemoryPools; |
| }; |
| |
| /// Class representing the host device. This host device may have more than one |
| /// HSA host agent. We aggregate all its resources into the same instance. |
| struct AMDHostDeviceTy : public AMDGenericDeviceTy { |
| /// Create a host device from an array of host agents. |
| AMDHostDeviceTy(const llvm::SmallVector<hsa_agent_t> &HostAgents) |
| : AMDGenericDeviceTy(), Agents(HostAgents), ArgsMemoryManager(), |
| PinnedMemoryManager() { |
| assert(HostAgents.size() && "No host agent found"); |
| } |
| |
| /// Initialize the host device memory pools and the memory managers for |
| /// kernel args and host pinned memory allocations. |
| Error init() { |
| if (auto Err = initMemoryPools()) |
| return Err; |
| |
| if (auto Err = ArgsMemoryManager.init(getArgsMemoryPool())) |
| return Err; |
| |
| if (auto Err = PinnedMemoryManager.init(getFineGrainedMemoryPool())) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| /// Deinitialize memory pools and managers. |
| Error deinit() { |
| if (auto Err = deinitMemoryPools()) |
| return Err; |
| |
| if (auto Err = ArgsMemoryManager.deinit()) |
| return Err; |
| |
| if (auto Err = PinnedMemoryManager.deinit()) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| /// Retrieve and construct all memory pools from the host agents. |
| Error retrieveAllMemoryPools() override { |
| // Iterate through the available pools across the host agents. |
| for (hsa_agent_t Agent : Agents) { |
| Error Err = utils::iterateAgentMemoryPools( |
| Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { |
| AMDGPUMemoryPoolTy *MemoryPool = |
| new AMDGPUMemoryPoolTy(HSAMemoryPool); |
| AllMemoryPools.push_back(MemoryPool); |
| return HSA_STATUS_SUCCESS; |
| }); |
| if (Err) |
| return Err; |
| } |
| return Plugin::success(); |
| } |
| |
| /// Get one of the host agents. Always returns the first agent. |
| hsa_agent_t getAgent() const override { return Agents[0]; } |
| |
| /// Get a memory pool for fine-grained allocations. |
| AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() { |
| assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool"); |
| // Retrieve any memory pool. |
| return *FineGrainedMemoryPools[0]; |
| } |
| |
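| /// Get a memory pool for coarse-grained allocations. |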
| AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() { |
| assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool"); |
| // Retrieve any memory pool. |
| return *CoarseGrainedMemoryPools[0]; |
| } |
| |
| /// Get a memory pool for kernel args allocations. |
| AMDGPUMemoryPoolTy &getArgsMemoryPool() { |
| assert(!ArgsMemoryPools.empty() && "No kernelargs mempool"); |
| // Retrieve any memory pool. |
| return *ArgsMemoryPools[0]; |
| } |
| |
| /// Getters for kernel args and host pinned memory managers. |
| AMDGPUMemoryManagerTy &getArgsMemoryManager() { return ArgsMemoryManager; } |
| AMDGPUMemoryManagerTy &getPinnedMemoryManager() { |
| return PinnedMemoryManager; |
| } |
| |
| private: |
| /// Array of agents on the host side. |
| const llvm::SmallVector<hsa_agent_t> Agents; |
| |
| // Memory manager for kernel arguments. |
| AMDGPUMemoryManagerTy ArgsMemoryManager; |
| |
| // Memory manager for pinned memory. |
| AMDGPUMemoryManagerTy PinnedMemoryManager; |
| }; |
| |
| /// Class implementing the AMDGPU device functionalities which derives from the |
| /// generic device class. |
| struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { |
| // Create an AMDGPU device with a device id and default AMDGPU grid values. |
| AMDGPUDeviceTy(int32_t DeviceId, int32_t NumDevices, |
| AMDHostDeviceTy &HostDevice, hsa_agent_t Agent) |
| : GenericDeviceTy(DeviceId, NumDevices, {0}), AMDGenericDeviceTy(), |
| OMPX_NumQueues("LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES", 4), |
| OMPX_QueueSize("LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE", 512), |
| OMPX_DefaultTeamsPerCU("LIBOMPTARGET_AMDGPU_TEAMS_PER_CU", 4), |
| OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES", |
| 1 * 1024 * 1024), // 1MB |
| OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS", |
| 64), |
| OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000), |
| OMPX_UseMultipleSdmaEngines( |
| "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false), |
| AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this), |
| AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {} |
| |
| ~AMDGPUDeviceTy() {} |
| |
| /// Initialize the device, its resources and get its properties. |
| Error initImpl(GenericPluginTy &Plugin) override { |
| // First setup all the memory pools. |
| if (auto Err = initMemoryPools()) |
| return Err; |
| |
| char GPUName[64]; |
| if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName)) |
| return Err; |
| ComputeUnitKind = GPUName; |
| |
| // Get the wavefront size. |
| uint32_t WavefrontSize = 0; |
| if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) |
| return Err; |
| GridValues.GV_Warp_Size = WavefrontSize; |
| |
| // Get the frequency of the steady clock. If the attribute is missing, |
| // assume running on an older libhsa and default to 0; omp_get_wtime will |
| // be inaccurate but otherwise programs can still run. |
| if (auto Err = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY, |
| ClockFrequency)) |
| ClockFrequency = 0; |
| |
| // Load the grid values depending on the wavefront size. |
| if (WavefrontSize == 32) |
| GridValues = getAMDGPUGridValues<32>(); |
| else if (WavefrontSize == 64) |
| GridValues = getAMDGPUGridValues<64>(); |
| else |
| return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize); |
| |
| // Get maximum number of workitems per workgroup. |
| uint16_t WorkgroupMaxDim[3]; |
| if (auto Err = |
| getDeviceAttr(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim)) |
| return Err; |
| GridValues.GV_Max_WG_Size = WorkgroupMaxDim[0]; |
| |
| // Get maximum number of workgroups. |
| hsa_dim3_t GridMaxDim; |
| if (auto Err = getDeviceAttr(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim)) |
| return Err; |
| |
| GridValues.GV_Max_Teams = GridMaxDim.x / GridValues.GV_Max_WG_Size; |
| if (GridValues.GV_Max_Teams == 0) |
| return Plugin::error("Maximum number of teams cannot be zero"); |
| |
| // Compute the default number of teams. |
| uint32_t ComputeUnits = 0; |
| if (auto Err = |
| getDeviceAttr(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, ComputeUnits)) |
| return Err; |
| GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU; |
| |
| uint32_t WavesPerCU = 0; |
| if (auto Err = |
| getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU)) |
| return Err; |
| HardwareParallelism = ComputeUnits * WavesPerCU; |
| |
| // Get maximum size of any device queues and maximum number of queues. |
| uint32_t MaxQueueSize; |
| if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize)) |
| return Err; |
| |
| uint32_t MaxQueues; |
| if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUES_MAX, MaxQueues)) |
| return Err; |
| |
| // Compute the number of queues and their size. |
| OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues)); |
| OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize); |
| |
| // Initialize stream pool. |
| if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams, |
| OMPX_NumQueues, OMPX_QueueSize)) |
| return Err; |
| |
| // Initialize event pool. |
| if (auto Err = AMDGPUEventManager.init(OMPX_InitialNumEvents)) |
| return Err; |
| |
| // Initialize signal pool. |
| if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals)) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| /// Deinitialize the device and release its resources. |
| Error deinitImpl() override { |
| // Deinitialize the stream and event pools. |
| if (auto Err = AMDGPUStreamManager.deinit()) |
| return Err; |
| |
| if (auto Err = AMDGPUEventManager.deinit()) |
| return Err; |
| |
| if (auto Err = AMDGPUSignalManager.deinit()) |
| return Err; |
| |
| // Close modules if necessary. |
| if (!LoadedImages.empty()) { |
| // Each image has its own module. |
| for (DeviceImageTy *Image : LoadedImages) { |
| AMDGPUDeviceImageTy &AMDImage = |
| static_cast<AMDGPUDeviceImageTy &>(*Image); |
| |
| // Unload the executable of the image. |
| if (auto Err = AMDImage.unloadExecutable()) |
| return Err; |
| } |
| } |
| |
| // Invalidate agent reference. |
| Agent = {0}; |
| |
| return Plugin::success(); |
| } |
| |
| virtual Error callGlobalConstructors(GenericPluginTy &Plugin, |
| DeviceImageTy &Image) override { |
| return callGlobalCtorDtorCommon(Plugin, Image, "amdgcn.device.init"); |
| } |
| |
| virtual Error callGlobalDestructors(GenericPluginTy &Plugin, |
| DeviceImageTy &Image) override { |
| return callGlobalCtorDtorCommon(Plugin, Image, "amdgcn.device.fini"); |
| } |
| |
| uint64_t getStreamBusyWaitMicroseconds() const { |
| return OMPX_StreamBusyWait; |
| } |
| |
| Expected<std::unique_ptr<MemoryBuffer>> |
| doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override { |
| |
| // TODO: We should try to avoid materialization but there seems to be no |
| // good linker interface w/o file i/o. |
| SmallString<128> LinkerOutputFilePath; |
| std::error_code EC = sys::fs::createTemporaryFile( |
| "amdgpu-pre-link-jit", ".out", LinkerOutputFilePath); |
| if (EC) |
| return createStringError(EC, |
| "Failed to create temporary file for linker"); |
| |
| SmallString<128> LinkerInputFilePath = LinkerOutputFilePath; |
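| // Drop the trailing "ut" so the linker input shares the unique path but |
| // ends in a ".o"-style suffix instead of ".out". |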
| LinkerInputFilePath.pop_back_n(2); |
| |
| auto FD = raw_fd_ostream(LinkerInputFilePath.data(), EC); |
| if (EC) |
| return createStringError(EC, "Failed to open temporary file for linker"); |
| FD.write(MB->getBufferStart(), MB->getBufferSize()); |
| FD.close(); |
| |
| const auto &ErrorOrPath = sys::findProgramByName("lld"); |
| if (!ErrorOrPath) |
| return createStringError(inconvertibleErrorCode(), |
| "Failed to find `lld` on the PATH."); |
| |
| std::string LLDPath = ErrorOrPath.get(); |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), |
| "Using `%s` to link JITed amdgcn ouput.", LLDPath.c_str()); |
| |
| std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind(); |
| |
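| // The resulting invocation is roughly: |
| //   lld -flavor gnu --no-undefined -shared -plugin-opt=mcpu=<arch> \ |
| //       -o <output> <input> |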
| StringRef Args[] = {LLDPath, |
| "-flavor", |
| "gnu", |
| "--no-undefined", |
| "-shared", |
| MCPU, |
| "-o", |
| LinkerOutputFilePath.data(), |
| LinkerInputFilePath.data()}; |
| |
| std::string Error; |
| int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error); |
| if (RC) |
| return createStringError(inconvertibleErrorCode(), |
| "Linking optimized bitcode failed: %s", |
| Error.c_str()); |
| |
| return std::move( |
| MemoryBuffer::getFileOrSTDIN(LinkerOutputFilePath.data()).get()); |
| } |
| |
| /// See GenericDeviceTy::getComputeUnitKind(). |
| std::string getComputeUnitKind() const override { return ComputeUnitKind; } |
| |
| /// Returns the clock frequency for the given AMDGPU device. |
| uint64_t getClockFrequency() const override { return ClockFrequency; } |
| |
| /// Allocate and construct an AMDGPU kernel. |
| Expected<GenericKernelTy &> |
| constructKernel(const __tgt_offload_entry &KernelEntry) override { |
| // Allocate and construct the AMDGPU kernel. |
| AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>(); |
| if (!AMDGPUKernel) |
| return Plugin::error("Failed to allocate memory for AMDGPU kernel"); |
| |
| new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name); |
| |
| return *AMDGPUKernel; |
| } |
| |
| /// Set the current context to this device's context. Do nothing since the |
| /// AMDGPU devices do not have the concept of contexts. |
| Error setContext() override { return Plugin::success(); } |
| |
| /// AMDGPU returns the product of the number of compute units and the waves |
| /// per compute unit. |
| uint64_t getHardwareParallelism() const override { |
| return HardwareParallelism; |
| } |
| |
| /// We want to set up the RPC server for host services to the GPU if it is |
| /// available. |
| bool shouldSetupRPCServer() const override { |
| return libomptargetSupportsRPC(); |
| } |
| |
| /// The RPC interface should have enough space for all available parallelism. |
| uint64_t requestedRPCPortCount() const override { |
| return getHardwareParallelism(); |
| } |
| |
| /// Get the stream of the asynchronous info structure or get a new one. |
| Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, |
| AMDGPUStreamTy *&Stream) { |
| // Get the stream (if any) from the async info. |
| Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>(); |
| if (!Stream) { |
| // There was no stream; get an idle one. |
| if (auto Err = AMDGPUStreamManager.getResource(Stream)) |
| return Err; |
| |
| // Modify the async info's stream. |
| AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream); |
| } |
| return Plugin::success(); |
| } |
| |
| /// Load the binary image into the device and allocate an image object. |
| Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage, |
| int32_t ImageId) override { |
| // Allocate and initialize the image object. |
| AMDGPUDeviceImageTy *AMDImage = |
| Plugin::get().allocate<AMDGPUDeviceImageTy>(); |
| new (AMDImage) AMDGPUDeviceImageTy(ImageId, TgtImage); |
| |
| // Load the HSA executable. |
| if (Error Err = AMDImage->loadExecutable(*this)) |
| return std::move(Err); |
| |
| return AMDImage; |
| } |
| |
| /// Allocate memory on the device or related to the device. |
| void *allocate(size_t Size, void *, TargetAllocTy Kind) override; |
| |
| /// Deallocate memory on the device or related to the device. |
| int free(void *TgtPtr, TargetAllocTy Kind) override { |
| if (TgtPtr == nullptr) |
| return OFFLOAD_SUCCESS; |
| |
| AMDGPUMemoryPoolTy *MemoryPool = nullptr; |
| switch (Kind) { |
| case TARGET_ALLOC_DEFAULT: |
| case TARGET_ALLOC_DEVICE: |
| case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
| MemoryPool = CoarseGrainedMemoryPools[0]; |
| break; |
| case TARGET_ALLOC_HOST: |
| MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
| break; |
| case TARGET_ALLOC_SHARED: |
| MemoryPool = &HostDevice.getFineGrainedMemoryPool(); |
| break; |
| } |
| |
| if (!MemoryPool) { |
| REPORT("No memory pool for the specified allocation kind\n"); |
| return OFFLOAD_FAIL; |
| } |
| |
| if (Error Err = MemoryPool->deallocate(TgtPtr)) { |
| REPORT("%s\n", toString(std::move(Err)).data()); |
| return OFFLOAD_FAIL; |
| } |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| /// Synchronize current thread with the pending operations on the async info. |
| Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { |
| AMDGPUStreamTy *Stream = |
| reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
| assert(Stream && "Invalid stream"); |
| |
| if (auto Err = Stream->synchronize()) |
| return Err; |
| |
| // Once the stream is synchronized, return it to stream pool and reset |
| // AsyncInfo. This is to make sure the synchronization only works for its |
| // own tasks. |
| AsyncInfo.Queue = nullptr; |
| return AMDGPUStreamManager.returnResource(Stream); |
| } |
| |
| /// Query for the completion of the pending operations on the async info. |
| Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override { |
| AMDGPUStreamTy *Stream = |
| reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue); |
| assert(Stream && "Invalid stream"); |
| |
| auto CompletedOrErr = Stream->query(); |
| if (!CompletedOrErr) |
| return CompletedOrErr.takeError(); |
| |
| // Return if the stream did not complete yet. |
| if (!(*CompletedOrErr)) |
| return Plugin::success(); |
| |
| // Once the stream is completed, return it to stream pool and reset |
| // AsyncInfo. This is to make sure the synchronization only works for its |
| // own tasks. |
| AsyncInfo.Queue = nullptr; |
| return AMDGPUStreamManager.returnResource(Stream); |
| } |
| |
| /// Pin the host buffer and return the device pointer that should be used for |
| /// device transfers. |
| Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override { |
| void *PinnedPtr = nullptr; |
| |
| hsa_status_t Status = |
| hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr); |
| if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) |
| return std::move(Err); |
| |
| return PinnedPtr; |
| } |
| |
| /// Unpin the host buffer. |
| Error dataUnlockImpl(void *HstPtr) override { |
| hsa_status_t Status = hsa_amd_memory_unlock(HstPtr); |
| return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n"); |
| } |
| |
| /// Check through the HSA runtime whether the \p HstPtr buffer is pinned. |
| Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr, |
| void *&BaseDevAccessiblePtr, |
| size_t &BaseSize) const override { |
| hsa_amd_pointer_info_t Info; |
| Info.size = sizeof(hsa_amd_pointer_info_t); |
| |
| hsa_status_t Status = |
| hsa_amd_pointer_info(HstPtr, &Info, /* Allocator */ nullptr, |
| /* Number of accessible agents (out) */ nullptr, |
| /* Accessible agents */ nullptr); |
| if (auto Err = Plugin::check(Status, "Error in hsa_amd_pointer_info: %s")) |
| return std::move(Err); |
| |
| // The buffer may be locked or allocated through HSA allocators. Assume that |
| // the buffer is host pinned if the runtime reports an HSA type. |
| if (Info.type != HSA_EXT_POINTER_TYPE_LOCKED && |
| Info.type != HSA_EXT_POINTER_TYPE_HSA) |
| return false; |
| |
| assert(Info.hostBaseAddress && "Invalid host pinned address"); |
| assert(Info.agentBaseAddress && "Invalid agent pinned address"); |
| assert(Info.sizeInBytes > 0 && "Invalid pinned allocation size"); |
| |
| // Save the allocation info in the output parameters. |
| BaseHstPtr = Info.hostBaseAddress; |
| BaseDevAccessiblePtr = Info.agentBaseAddress; |
| BaseSize = Info.sizeInBytes; |
| |
| return true; |
| } |
| |
| /// Submit data to the device (host to device transfer). |
| Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| AMDGPUStreamTy *Stream = nullptr; |
| void *PinnedPtr = nullptr; |
| |
| // Use one-step asynchronous operation when host memory is already pinned. |
| if (void *PinnedPtr = |
| PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| return Stream->pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size); |
| } |
| |
| // For large transfers use synchronous behavior. |
| if (Size >= OMPX_MaxAsyncCopyBytes) { |
| if (AsyncInfoWrapper.hasQueue()) |
| if (auto Err = synchronize(AsyncInfoWrapper)) |
| return Err; |
| |
| hsa_status_t Status; |
| Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, |
| &PinnedPtr); |
| if (auto Err = |
| Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) |
| return Err; |
| |
| AMDGPUSignalTy Signal; |
| if (auto Err = Signal.init()) |
| return Err; |
| |
| if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr, |
| Agent, PinnedPtr, Agent, Size, 0, |
| nullptr, Signal.get())) |
| return Err; |
| |
| if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
| return Err; |
| |
| if (auto Err = Signal.deinit()) |
| return Err; |
| |
| Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); |
| return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n"); |
| } |
| |
| // Otherwise, use two-step copy with an intermediate pinned host buffer. |
| AMDGPUMemoryManagerTy &PinnedMemoryManager = |
| HostDevice.getPinnedMemoryManager(); |
| if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr)) |
| return Err; |
| |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size, |
| PinnedMemoryManager); |
| } |
| |
| /// Retrieve data from the device (device to host transfer). |
| Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| AMDGPUStreamTy *Stream = nullptr; |
| void *PinnedPtr = nullptr; |
| |
| // Use one-step asynchronous operation when host memory is already pinned. |
| if (void *PinnedPtr = |
| PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| return Stream->pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size); |
| } |
| |
| // For large transfers use synchronous behavior. |
| if (Size >= OMPX_MaxAsyncCopyBytes) { |
| if (AsyncInfoWrapper.hasQueue()) |
| if (auto Err = synchronize(AsyncInfoWrapper)) |
| return Err; |
| |
| hsa_status_t Status; |
| Status = hsa_amd_memory_lock(const_cast<void *>(HstPtr), Size, nullptr, 0, |
| &PinnedPtr); |
| if (auto Err = |
| Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) |
| return Err; |
| |
| AMDGPUSignalTy Signal; |
| if (auto Err = Signal.init()) |
| return Err; |
| |
| if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr, |
| Agent, TgtPtr, Agent, Size, 0, nullptr, |
| Signal.get())) |
| return Err; |
| |
| if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) |
| return Err; |
| |
| if (auto Err = Signal.deinit()) |
| return Err; |
| |
| Status = hsa_amd_memory_unlock(const_cast<void *>(HstPtr)); |
| return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n"); |
| } |
| |
| // Otherwise, use two-step copy with an intermediate pinned host buffer. |
| AMDGPUMemoryManagerTy &PinnedMemoryManager = |
| HostDevice.getPinnedMemoryManager(); |
| if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr)) |
| return Err; |
| |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size, |
| PinnedMemoryManager); |
| } |
| |
| /// Exchange data between two devices within the plugin. |
| Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, |
| void *DstPtr, int64_t Size, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice); |
| |
| AMDGPUStreamTy *Stream = nullptr; |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| if (Size <= 0) |
| return Plugin::success(); |
| |
| return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr, |
| getAgent(), (uint64_t)Size); |
| } |
| |
| /// Initialize the async info for interoperability purposes. |
| Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| // TODO: Implement this function. |
| return Plugin::success(); |
| } |
| |
| /// Initialize the device info for interoperability purposes. |
| Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { |
| DeviceInfo->Context = nullptr; |
| |
| if (!DeviceInfo->Device) |
| DeviceInfo->Device = reinterpret_cast<void *>(Agent.handle); |
| |
| return Plugin::success(); |
| } |
| |
| /// Create an event. |
| Error createEventImpl(void **EventPtrStorage) override { |
| AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage); |
| return AMDGPUEventManager.getResource(*Event); |
| } |
| |
| /// Destroy a previously created event. |
| Error destroyEventImpl(void *EventPtr) override { |
| AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
| return AMDGPUEventManager.returnResource(Event); |
| } |
| |
| /// Record the event. |
| Error recordEventImpl(void *EventPtr, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
| assert(Event && "Invalid event"); |
| |
| AMDGPUStreamTy *Stream = nullptr; |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| return Event->record(*Stream); |
| } |
| |
| /// Make the stream wait on the event. |
| Error waitEventImpl(void *EventPtr, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr); |
| |
| AMDGPUStreamTy *Stream = nullptr; |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| return Event->wait(*Stream); |
| } |
| |
| /// Synchronize the current thread with the event. |
| Error syncEventImpl(void *EventPtr) override { |
| return Plugin::error("Synchronize event not implemented"); |
| } |
| |
| /// Print information about the device. |
| Error obtainInfoImpl(InfoQueueTy &Info) override { |
| char TmpChar[1000]; |
| const char *TmpCharPtr = "Unknown"; |
| uint16_t Major, Minor; |
| uint32_t TmpUInt, TmpUInt2; |
| uint32_t CacheSize[4]; |
| size_t TmpSt; |
| bool TmpBool; |
| uint16_t WorkgrpMaxDim[3]; |
| hsa_dim3_t GridMaxDim; |
| hsa_status_t Status, Status2; |
| |
| Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major); |
| Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor); |
| if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS) |
| Info.add("HSA Runtime Version", |
| std::to_string(Major) + "." + std::to_string(Minor)); |
| |
| Info.add("HSA OpenMP Device Number", DeviceId); |
| |
| Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Product Name", TmpChar); |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Device Name", TmpChar); |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Vendor Name", TmpChar); |
| |
| hsa_device_type_t DevType; |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); |
| if (Status == HSA_STATUS_SUCCESS) { |
| switch (DevType) { |
| case HSA_DEVICE_TYPE_CPU: |
| TmpCharPtr = "CPU"; |
| break; |
| case HSA_DEVICE_TYPE_GPU: |
| TmpCharPtr = "GPU"; |
| break; |
| case HSA_DEVICE_TYPE_DSP: |
| TmpCharPtr = "DSP"; |
| break; |
| } |
| Info.add("Device Type", TmpCharPtr); |
| } |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUES_MAX, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Max Queues", TmpUInt); |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MIN_SIZE, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Queue Min Size", TmpUInt); |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_QUEUE_MAX_SIZE, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Queue Max Size", TmpUInt); |
| |
| // FIXME: This is deprecated according to HSA documentation. But using |
| // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution at |
| // runtime. |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize); |
| if (Status == HSA_STATUS_SUCCESS) { |
| Info.add("Cache"); |
| |
| for (int I = 0; I < 4; I++) |
| if (CacheSize[I]) |
| Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]); |
| } |
| |
| Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Cacheline Size", TmpUInt); |
| |
| Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Max Clock Freq", TmpUInt, "MHz"); |
| |
| Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Compute Units", TmpUInt); |
| |
| Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("SIMD per CU", TmpUInt); |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_FAST_F16_OPERATION, TmpBool); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Fast F16 Operation", TmpBool); |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_WAVEFRONT_SIZE, TmpUInt2); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Wavefront Size", TmpUInt2); |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Workgroup Max Size", TmpUInt); |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim); |
| if (Status == HSA_STATUS_SUCCESS) { |
| Info.add("Workgroup Max Size per Dimension"); |
| Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]); |
| Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]); |
| Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]); |
| } |
| |
| Status = getDeviceAttrRaw( |
| (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) { |
| Info.add("Max Waves Per CU", TmpUInt); |
| Info.add("Max Work-item Per CU", TmpUInt * TmpUInt2); |
| } |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_SIZE, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Grid Max Size", TmpUInt); |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim); |
| if (Status == HSA_STATUS_SUCCESS) { |
| Info.add("Grid Max Size per Dimension"); |
| Info.add<InfoLevel2>("x", GridMaxDim.x); |
| Info.add<InfoLevel2>("y", GridMaxDim.y); |
| Info.add<InfoLevel2>("z", GridMaxDim.z); |
| } |
| |
| Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add("Max fbarriers/Workgrp", TmpUInt); |
| |
| Info.add("Memory Pools"); |
| for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { |
| std::string TmpStr, TmpStr2; |
| |
| if (Pool->isGlobal()) |
| TmpStr = "Global"; |
| else if (Pool->isReadOnly()) |
| TmpStr = "ReadOnly"; |
| else if (Pool->isPrivate()) |
| TmpStr = "Private"; |
| else if (Pool->isGroup()) |
| TmpStr = "Group"; |
| else |
| TmpStr = "Unknown"; |
| |
| Info.add<InfoLevel2>(std::string("Pool ") + TmpStr); |
| |
| if (Pool->isGlobal()) { |
| if (Pool->isFineGrained()) |
| TmpStr2 += "Fine Grained "; |
| if (Pool->isCoarseGrained()) |
| TmpStr2 += "Coarse Grained "; |
| if (Pool->supportsKernelArgs()) |
| TmpStr2 += "Kernarg "; |
| |
| Info.add<InfoLevel3>("Flags", TmpStr2); |
| } |
| |
| Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add<InfoLevel3>("Size", TmpSt, "bytes"); |
| |
| Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, |
| TmpBool); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add<InfoLevel3>("Allocatable", TmpBool); |
| |
| Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, |
| TmpSt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes"); |
| |
| Status = Pool->getAttrRaw( |
| HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes"); |
| |
| Status = |
| Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add<InfoLevel3>("Accessable by all", TmpBool); |
| } |
| |
| Info.add("ISAs"); |
| auto Err = utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) { |
| Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar); |
| if (Status == HSA_STATUS_SUCCESS) |
| Info.add<InfoLevel2>("Name", TmpChar); |
| |
| return Status; |
| }); |
| |
| // Silently consume the error. |
| if (Err) |
| consumeError(std::move(Err)); |
| |
| return Plugin::success(); |
| } |
| |
| /// Getters and setters for stack and heap sizes. |
| Error getDeviceStackSize(uint64_t &Value) override { |
| Value = StackSize; |
| return Plugin::success(); |
| } |
| Error setDeviceStackSize(uint64_t Value) override { |
| StackSize = Value; |
| return Plugin::success(); |
| } |
| Error getDeviceHeapSize(uint64_t &Value) override { |
| Value = DeviceMemoryPoolSize; |
| return Plugin::success(); |
| } |
| Error setDeviceHeapSize(uint64_t Value) override { |
| for (DeviceImageTy *Image : LoadedImages) |
| if (auto Err = setupDeviceMemoryPool(Plugin::get(), *Image, Value)) |
| return Err; |
| DeviceMemoryPoolSize = Value; |
| return Plugin::success(); |
| } |
| Error getDeviceMemorySize(uint64_t &Value) override { |
| for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { |
| if (Pool->isGlobal()) { |
| hsa_status_t Status = |
| Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value); |
| return Plugin::check(Status, "Error in getting device memory size: %s"); |
| } |
| } |
| return Plugin::error("getDeviceMemorySize:: no global pool"); |
| } |
| |
| /// AMDGPU-specific function to get device attributes. |
| template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) { |
| hsa_status_t Status = |
| hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); |
| return Plugin::check(Status, "Error in hsa_agent_get_info: %s"); |
| } |
| |
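| /// Raw variant of getDeviceAttr that returns the HSA status instead of an |
| /// Error, so callers can tolerate missing attributes. |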
| template <typename Ty> |
| hsa_status_t getDeviceAttrRaw(uint32_t Kind, Ty &Value) { |
| return hsa_agent_get_info(Agent, (hsa_agent_info_t)Kind, &Value); |
| } |
| |
| /// Get the device agent. |
| hsa_agent_t getAgent() const override { return Agent; } |
| |
| /// Get the signal manager. |
| AMDGPUSignalManagerTy &getSignalManager() { return AMDGPUSignalManager; } |
| |
| /// Retrieve and construct all memory pools of the device agent. |
| Error retrieveAllMemoryPools() override { |
| // Iterate through the available pools of the device agent. |
| return utils::iterateAgentMemoryPools( |
| Agent, [&](hsa_amd_memory_pool_t HSAMemoryPool) { |
| AMDGPUMemoryPoolTy *MemoryPool = |
| Plugin::get().allocate<AMDGPUMemoryPoolTy>(); |
| new (MemoryPool) AMDGPUMemoryPoolTy(HSAMemoryPool); |
| AllMemoryPools.push_back(MemoryPool); |
| return HSA_STATUS_SUCCESS; |
| }); |
| } |
| |
| bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; } |
| |
| private: |
| using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>; |
| using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>; |
| |
| /// Common method to invoke a single threaded constructor or destructor |
| /// kernel by name. |
| Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, |
| const char *Name) { |
| // Perform a quick check for the named kernel in the image. The kernel |
| // should be created by the 'amdgpu-lower-ctor-dtor' pass. |
| GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
| if (!Handler.isSymbolInImage(*this, Image, Name)) |
| return Plugin::success(); |
| |
| // Construct and initialize the AMDGPU kernel. |
| AMDGPUKernelTy AMDGPUKernel(Name); |
| if (auto Err = AMDGPUKernel.init(*this, Image)) |
| return Err; |
| |
| AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); |
| |
| KernelArgsTy KernelArgs = {}; |
| if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u, |
| /*NumBlocks=*/1ul, KernelArgs, |
| /*Args=*/nullptr, AsyncInfoWrapper)) |
| return Err; |
| |
| Error Err = Plugin::success(); |
| AsyncInfoWrapper.finalize(Err); |
| |
| return Err; |
| } |
| |
| /// Envar for controlling the number of HSA queues per device. High number of |
| /// queues may degrade performance. |
| UInt32Envar OMPX_NumQueues; |
| |
| /// Envar for controlling the size of each HSA queue. The size is the number |
| /// of HSA packets a queue is expected to hold. It is also the number of HSA |
| /// packets that can be pushed into each queue without waiting the driver to |
| /// process them. |
| UInt32Envar OMPX_QueueSize; |
| |
| /// Envar for controlling the default number of teams relative to the number |
| /// of compute units (CUs) the device has: |
| /// #default_teams = OMPX_DefaultTeamsPerCU * #CUs. |
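| /// For example, a device with 104 CUs and the default of 4 teams per CU |
| /// gets 416 default teams. |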
| UInt32Envar OMPX_DefaultTeamsPerCU; |
| |
| /// Envar specifying the maximum size in bytes below which memory copies are |
| /// asynchronous operations pushed to the corresponding stream. Transfers at |
| /// or above this size are performed synchronously. |
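| /// For example, with the default of 1 MB, a 512 KB copy is pushed |
| /// asynchronously while a 4 MB copy is staged synchronously. |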
| UInt32Envar OMPX_MaxAsyncCopyBytes; |
| |
| /// Envar controlling the initial number of HSA signals per device. There is |
| /// one manager of signals per device managing several pre-allocated signals. |
| /// These signals are mainly used by AMDGPU streams. If needed, more signals |
| /// will be created. |
| UInt32Envar OMPX_InitialNumSignals; |
| |
| /// Envar to set the time to wait in active state before switching to a |
| /// blocked state. The default of 2000000 busy-waits for 2 seconds before |
| /// going into a blocking HSA wait state. The unit is microseconds. |
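| /// For example, setting LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT=0 should make |
| /// waits fall back to the blocking state immediately (illustrative; not a |
| /// tuned recommendation). |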
| UInt32Envar OMPX_StreamBusyWait; |
| |
| /// Use ROCm 5.7 interface for multiple SDMA engines |
| BoolEnvar OMPX_UseMultipleSdmaEngines; |
| |
| /// Stream manager for AMDGPU streams. |
| AMDGPUStreamManagerTy AMDGPUStreamManager; |
| |
| /// Event manager for AMDGPU events. |
| AMDGPUEventManagerTy AMDGPUEventManager; |
| |
| /// Signal manager for AMDGPU signals. |
| AMDGPUSignalManagerTy AMDGPUSignalManager; |
| |
| /// The agent handler corresponding to the device. |
| hsa_agent_t Agent; |
| |
| /// The GPU architecture. |
| std::string ComputeUnitKind; |
| |
| /// The frequency of the steady clock inside the device. |
| uint64_t ClockFrequency; |
| |
| /// The total number of concurrent work items that can be running on the GPU. |
| uint64_t HardwareParallelism; |
| |
| /// Reference to the host device. |
| AMDHostDeviceTy &HostDevice; |
| |
| /// The current size of the global device memory pool (managed by us). |
| uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */; |
| |
| /// The current size of the stack that will be used in cases where it could |
| /// not be statically determined. |
| uint64_t StackSize = 16 * 1024 /* 16 KB */; |
| }; |
| |
| Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { |
| hsa_status_t Status; |
| Status = hsa_code_object_deserialize(getStart(), getSize(), "", &CodeObject); |
| if (auto Err = |
| Plugin::check(Status, "Error in hsa_code_object_deserialize: %s")) |
| return Err; |
| |
| Status = hsa_executable_create_alt( |
| HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", &Executable); |
| if (auto Err = |
| Plugin::check(Status, "Error in hsa_executable_create_alt: %s")) |
| return Err; |
| |
| Status = hsa_executable_load_code_object(Executable, Device.getAgent(), |
| CodeObject, ""); |
| if (auto Err = |
| Plugin::check(Status, "Error in hsa_executable_load_code_object: %s")) |
| return Err; |
| |
| Status = hsa_executable_freeze(Executable, ""); |
| if (auto Err = Plugin::check(Status, "Error in hsa_executable_freeze: %s")) |
| return Err; |
| |
| uint32_t Result; |
| Status = hsa_executable_validate(Executable, &Result); |
| if (auto Err = Plugin::check(Status, "Error in hsa_executable_validate: %s")) |
| return Err; |
| |
| if (Result) |
| return Plugin::error("Loaded HSA executable does not validate"); |
| |
| if (auto Err = utils::readAMDGPUMetaDataFromImage( |
| getMemoryBuffer(), KernelInfoMap, ELFABIVersion)) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| Expected<hsa_executable_symbol_t> |
| AMDGPUDeviceImageTy::findDeviceSymbol(GenericDeviceTy &Device, |
| StringRef SymbolName) const { |
| |
| AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); |
| hsa_agent_t Agent = AMDGPUDevice.getAgent(); |
| |
| hsa_executable_symbol_t Symbol; |
| hsa_status_t Status = hsa_executable_get_symbol_by_name( |
| Executable, SymbolName.data(), &Agent, &Symbol); |
| if (auto Err = Plugin::check( |
| Status, "Error in hsa_executable_get_symbol_by_name(%s): %s", |
| SymbolName.data())) |
| return std::move(Err); |
| |
| return Symbol; |
| } |
| |
| template <typename ResourceTy> |
| Error AMDGPUResourceRef<ResourceTy>::create(GenericDeviceTy &Device) { |
| if (Resource) |
| return Plugin::error("Creating an existing resource"); |
| |
| AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(Device); |
| |
| Resource = new ResourceTy(AMDGPUDevice); |
| |
| return Resource->init(); |
| } |
| |
| AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device) |
| : Agent(Device.getAgent()), Queue(nullptr), |
| SignalManager(Device.getSignalManager()), Device(Device), |
| // Initialize the std::deque with some empty positions. |
| Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr), |
| StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()), |
| UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {} |
| |
| /// Class implementing the AMDGPU-specific functionalities of the global |
| /// handler. |
| struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy { |
| /// Get the metadata of a global from the device. The name and size of the |
| /// global is read from DeviceGlobal and the address of the global is written |
| /// to DeviceGlobal. |
| Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, |
| DeviceImageTy &Image, |
| GlobalTy &DeviceGlobal) override { |
| AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(Image); |
| |
| // Find the symbol on the device executable. |
| auto SymbolOrErr = |
| AMDImage.findDeviceSymbol(Device, DeviceGlobal.getName()); |
| if (!SymbolOrErr) |
| return SymbolOrErr.takeError(); |
| |
| hsa_executable_symbol_t Symbol = *SymbolOrErr; |
| hsa_symbol_kind_t SymbolType; |
| hsa_status_t Status; |
| uint64_t SymbolAddr; |
| uint32_t SymbolSize; |
| |
| // Retrieve the type, address and size of the symbol. |
| std::pair<hsa_executable_symbol_info_t, void *> RequiredInfos[] = { |
| {HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &SymbolType}, |
| {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &SymbolAddr}, |
| {HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &SymbolSize}}; |
| |
| for (auto &Info : RequiredInfos) { |
| Status = hsa_executable_symbol_get_info(Symbol, Info.first, Info.second); |
| if (auto Err = Plugin::check( |
| Status, "Error in hsa_executable_symbol_get_info: %s")) |
| return Err; |
| } |
| |
| // Check the size of the symbol. |
| if (SymbolSize != DeviceGlobal.getSize()) |
| return Plugin::error( |
| "Failed to load global '%s' due to size mismatch (%zu != %zu)", |
| DeviceGlobal.getName().data(), SymbolSize, |
| (size_t)DeviceGlobal.getSize()); |
| |
| // Store the symbol address on the device global metadata. |
| DeviceGlobal.setPtr(reinterpret_cast<void *>(SymbolAddr)); |
| |
| return Plugin::success(); |
| } |
| }; |
| |
| /// Class implementing the AMDGPU-specific functionalities of the plugin. |
| struct AMDGPUPluginTy final : public GenericPluginTy { |
| /// Create an AMDGPU plugin and initialize the AMDGPU driver. |
| AMDGPUPluginTy() |
| : GenericPluginTy(getTripleArch()), Initialized(false), |
| HostDevice(nullptr) {} |
| |
| /// This class should not be copied. |
| AMDGPUPluginTy(const AMDGPUPluginTy &) = delete; |
| AMDGPUPluginTy(AMDGPUPluginTy &&) = delete; |
| |
| /// Initialize the plugin and return the number of devices. |
| Expected<int32_t> initImpl() override { |
| hsa_status_t Status = hsa_init(); |
| if (Status != HSA_STATUS_SUCCESS) { |
| // Cannot call hsa_success_string. |
| DP("Failed to initialize AMDGPU's HSA library\n"); |
| return 0; |
| } |
| |
| // The initialization of HSA was successful. It should be safe to call |
| // HSA functions from now on, e.g., hsa_shut_down. |
| Initialized = true; |
| |
| #ifdef OMPT_SUPPORT |
| ompt::connectLibrary(); |
| #endif |
| |
| // Register event handler to detect memory errors on the devices. |
| Status = hsa_amd_register_system_event_handler(eventHandler, nullptr); |
| if (auto Err = Plugin::check( |
| Status, "Error in hsa_amd_register_system_event_handler: %s")) |
| return std::move(Err); |
| |
| // List of host (CPU) agents. |
| llvm::SmallVector<hsa_agent_t> HostAgents; |
| |
| // Count the number of available agents. |
| auto Err = utils::iterateAgents([&](hsa_agent_t Agent) { |
| // Get the device type of the agent. |
| hsa_device_type_t DeviceType; |
| hsa_status_t Status = |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); |
| if (Status != HSA_STATUS_SUCCESS) |
| return Status; |
| |
| // Classify the agents into kernel (GPU) and host (CPU) agents. |
| if (DeviceType == HSA_DEVICE_TYPE_GPU) { |
| // Ensure that the GPU agent supports kernel dispatch packets. |
| hsa_agent_feature_t Features; |
| Status = hsa_agent_get_info(Agent, HSA_AGENT_INFO_FEATURE, &Features); |
| if (Features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) |
| KernelAgents.push_back(Agent); |
| } else if (DeviceType == HSA_DEVICE_TYPE_CPU) { |
| HostAgents.push_back(Agent); |
| } |
| return HSA_STATUS_SUCCESS; |
| }); |
| |
| if (Err) |
| return std::move(Err); |
| |
| int32_t NumDevices = KernelAgents.size(); |
| if (NumDevices == 0) { |
| // Do not initialize if there are no devices. |
| DP("There are no devices supporting AMDGPU.\n"); |
| return 0; |
| } |
| |
| // There are kernel agents but there is no host agent. That should be |
| // treated as an error. |
| if (HostAgents.empty()) |
| return Plugin::error("No AMDGPU host agents"); |
| |
| // Initialize the host device using host agents. |
| HostDevice = allocate<AMDHostDeviceTy>(); |
| new (HostDevice) AMDHostDeviceTy(HostAgents); |
| |
| // Set up the memory pools available to the host. |
| if (auto Err = HostDevice->init()) |
| return std::move(Err); |
| |
| return NumDevices; |
| } |
| |
| /// Deinitialize the plugin. |
| Error deinitImpl() override { |
| // The HSA runtime was not initialized, so nothing from the plugin was |
| // actually initialized. |
| if (!Initialized) |
| return Plugin::success(); |
| |
| if (HostDevice) |
| if (auto Err = HostDevice->deinit()) |
| return Err; |
| |
| // Finalize the HSA runtime. |
| hsa_status_t Status = hsa_shut_down(); |
| return Plugin::check(Status, "Error in hsa_shut_down: %s"); |
| } |
| |
| Triple::ArchType getTripleArch() const override { return Triple::amdgcn; } |
| |
| /// Get the ELF code for recognizing the compatible image binary. |
| uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; } |
| |
| /// Check whether the image is compatible with an AMDGPU device. |
| Expected<bool> isELFCompatible(StringRef Image) const override { |
| // Get the associated architecture and flags from the ELF. |
| auto ElfOrErr = |
| ELF64LEObjectFile::create(MemoryBufferRef(Image, /*Identifier=*/""), |
| /*InitContent=*/false); |
| if (!ElfOrErr) |
| return ElfOrErr.takeError(); |
| std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName(); |
| |
| for (hsa_agent_t Agent : KernelAgents) { |
| std::string Target; |
| auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) { |
| uint32_t Length; |
| hsa_status_t Status; |
| Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length); |
| if (Status != HSA_STATUS_SUCCESS) |
| return Status; |
| |
| llvm::SmallVector<char> ISAName(Length); |
| Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin()); |
| if (Status != HSA_STATUS_SUCCESS) |
| return Status; |
| |
| llvm::StringRef TripleTarget(ISAName.begin(), Length); |
| if (TripleTarget.consume_front("amdgcn-amd-amdhsa")) |
| Target = TripleTarget.ltrim('-').rtrim('\0').str(); |
| return HSA_STATUS_SUCCESS; |
| }); |
| if (Err) |
| return std::move(Err); |
| |
| if (!utils::isImageCompatibleWithEnv(Processor ? *Processor : "", |
| ElfOrErr->getPlatformFlags(), |
| Target)) |
| return false; |
| } |
| return true; |
| } |
| |
| bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { |
| return true; |
| } |
| |
| /// Get the host device instance. |
| AMDHostDeviceTy &getHostDevice() { |
| assert(HostDevice && "Host device not initialized"); |
| return *HostDevice; |
| } |
| |
| /// Get the kernel agent with the corresponding agent id. |
| hsa_agent_t getKernelAgent(int32_t AgentId) const { |
| assert((uint32_t)AgentId < KernelAgents.size() && "Invalid agent id"); |
| return KernelAgents[AgentId]; |
| } |
| |
| /// Get the list of the available kernel agents. |
| const llvm::SmallVector<hsa_agent_t> &getKernelAgents() const { |
| return KernelAgents; |
| } |
| |
| private: |
| /// Event handler that will be called by ROCr if an event is detected. |
| static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) { |
| if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT) |
| return HSA_STATUS_SUCCESS; |
| |
| SmallVector<std::string> Reasons; |
| uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask; |
| if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT) |
| Reasons.emplace_back("Page not present or supervisor privilege"); |
| if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY) |
| Reasons.emplace_back("Write access to a read-only page"); |
| if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX) |
| Reasons.emplace_back("Execute access to a page marked NX"); |
| if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY) |
| Reasons.emplace_back("GPU attempted access to a host only page"); |
| if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC) |
| Reasons.emplace_back("DRAM ECC failure"); |
| if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE) |
| Reasons.emplace_back("Can't determine the exact fault address"); |
| if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC) |
| Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)"); |
| if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG) |
| Reasons.emplace_back("GPU reset following unspecified hang"); |
| |
| // If none of the known reasons matched, report the raw reason mask instead.
| if (Reasons.empty()) |
| Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")"); |
| |
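| // Query the node id of the faulting agent so the diagnostic can identify
| // the GPU.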
| uint32_t Node = -1; |
| hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node); |
| |
| // Abort the execution since we cannot recover from this error.
| FATAL_MESSAGE(1, |
| "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64 |
| ") at virtual address %p. Reasons: %s", |
| Node, Event->memory_fault.agent.handle, |
| (void *)Event->memory_fault.virtual_address, |
| llvm::join(Reasons, ", ").c_str()); |
| |
| return HSA_STATUS_ERROR; |
| } |
| |
| /// Indicate whether the HSA runtime was correctly initialized. Even if there
| /// are no available devices, this boolean will be true. It indicates whether
| /// we can safely call HSA functions (e.g., hsa_shut_down).
| bool Initialized; |
| |
| /// Array of the available kernel (GPU) agents. These handles should not be
| /// stored here but in the AMDGPUDeviceTy structures directly. However, the
| /// HSA standard does not provide API functions to retrieve agents directly,
| /// only iterating functions. We cache the agents here for convenience.
| llvm::SmallVector<hsa_agent_t> KernelAgents; |
| |
| /// The device representing all HSA host agents. |
| AMDHostDeviceTy *HostDevice; |
| }; |
| |
| Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, |
| uint32_t NumThreads, uint64_t NumBlocks, |
| KernelArgsTy &KernelArgs, void *Args, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) const { |
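| // Each explicit kernel argument occupies one pointer-sized slot in the
| // kernel arguments buffer.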
| const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *); |
| |
| if (ArgsSize < KernelArgsSize) |
| return Plugin::error("Mismatch of kernel arguments size"); |
| |
| // The args size reported by HSA may or may not contain the implicit args. |
| // For now, assume that HSA does not consider the implicit arguments when |
| // reporting the arguments of a kernel. In the worst case, we can waste |
| // 56 bytes per allocation. |
| uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize; |
| |
| AMDHostDeviceTy &HostDevice = Plugin::get<AMDGPUPluginTy>().getHostDevice(); |
| AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager(); |
| |
| void *AllArgs = nullptr; |
| if (auto Err = ArgsMemoryManager.allocate(AllArgsSize, &AllArgs)) |
| return Err; |
| |
| // Account for user requested dynamic shared memory. |
| uint32_t GroupSize = getGroupSize(); |
| if (uint32_t MaxDynCGroupMem = std::max( |
| KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) { |
| GroupSize += MaxDynCGroupMem; |
| } |
| |
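| // Query the stack size so it can be forwarded along with the kernel launch.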
| uint64_t StackSize; |
| if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) |
| return Err; |
| |
| // Initialize implicit arguments. |
| utils::AMDGPUImplicitArgsTy *ImplArgs = |
| reinterpret_cast<utils::AMDGPUImplicitArgsTy *>( |
| advanceVoidPtr(AllArgs, KernelArgsSize)); |
| |
| // Initialize the implicit arguments to zero. |
| std::memset(ImplArgs, 0, ImplicitArgsSize); |
| |
| // Copy the explicit arguments. |
| // TODO: We should expose the args memory manager alloc to the common part
| // as an alternative to copying them twice.
| if (KernelArgs.NumArgs) |
| std::memcpy(AllArgs, *static_cast<void **>(Args), |
| sizeof(void *) * KernelArgs.NumArgs); |
| |
| AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice); |
| |
| AMDGPUStreamTy *Stream = nullptr; |
| if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| // If this kernel requires an RPC server we attach its pointer to the stream. |
| if (GenericDevice.getRPCServer()) |
| Stream->setRPCServer(GenericDevice.getRPCServer()); |
| |
| // Only COV5 implicit arguments need to be set. COV4 implicit arguments are
| // not used.
| if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) { |
| ImplArgs->BlockCountX = NumBlocks; |
| ImplArgs->BlockCountY = 1; |
| ImplArgs->BlockCountZ = 1; |
| ImplArgs->GroupSizeX = NumThreads; |
| ImplArgs->GroupSizeY = 1; |
| ImplArgs->GroupSizeZ = 1; |
| ImplArgs->GridDims = 1; |
| } |
| |
| // Push the kernel launch into the stream. |
| return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, |
| GroupSize, StackSize, ArgsMemoryManager); |
| } |
| |
| Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, |
| KernelArgsTy &KernelArgs, |
| uint32_t NumThreads, |
| uint64_t NumBlocks) const { |
| // Only gather and print this information when requested via the info level.
| if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL)) |
| return Plugin::success(); |
| |
| // We don't have the data needed to print additional info, but this is not
| // a hard error.
| if (!KernelInfo.has_value()) |
| return Plugin::success(); |
| |
| // General Info |
| auto NumGroups = NumBlocks; |
| auto ThreadsPerGroup = NumThreads; |
| |
| // Kernel Arguments Info |
| auto ArgNum = KernelArgs.NumArgs; |
| auto LoopTripCount = KernelArgs.Tripcount; |
| |
| // Details for AMDGPU kernels (read from image) |
| // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata |
| auto GroupSegmentSize = (*KernelInfo).GroupSegmentList; |
| auto SGPRCount = (*KernelInfo).SGPRCount; |
| auto VGPRCount = (*KernelInfo).VGPRCount; |
| auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount; |
| auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount; |
| auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize; |
| |
| // Prints additional launch info that contains the following. |
| // Num Args: The number of kernel arguments |
| // Teams x Thrds: The number of teams and the number of threads actually |
| // running. |
| // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the |
| // kernel in work-items |
| // LDS Usage: Amount of bytes used in LDS storage |
| // S/VGPR Count: the number of S/V GPRs occupied by the kernel |
| // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel |
| // Tripcount: loop tripcount for the kernel |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), |
| "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS " |
| "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: " |
| "%lu\n", |
| ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize, |
| GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount, |
| LoopTripCount); |
| |
| return Plugin::success(); |
| } |
| |
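| // Factory functions used by the generic plugin interface to create the
| // AMDGPU-specific instances.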
| GenericPluginTy *Plugin::createPlugin() { return new AMDGPUPluginTy(); } |
| |
| GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) { |
| AMDGPUPluginTy &Plugin = get<AMDGPUPluginTy &>(); |
| return new AMDGPUDeviceTy(DeviceId, NumDevices, Plugin.getHostDevice(), |
| Plugin.getKernelAgent(DeviceId)); |
| } |
| |
| GenericGlobalHandlerTy *Plugin::createGlobalHandler() { |
| return new AMDGPUGlobalHandlerTy(); |
| } |
| |
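| /// Check an HSA status code and convert it into a plugin error on failure,
| /// appending the HSA description string to the given format (used as, e.g.,
| /// Plugin::check(Status, "Error in hsa_shut_down: %s")). Treat
| /// HSA_STATUS_INFO_BREAK as success since it is commonly used to stop HSA
| /// iterations early.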
| template <typename... ArgsTy> |
| Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { |
| hsa_status_t ResultCode = static_cast<hsa_status_t>(Code); |
| if (ResultCode == HSA_STATUS_SUCCESS || ResultCode == HSA_STATUS_INFO_BREAK) |
| return Error::success(); |
| |
| const char *Desc = "Unknown error"; |
| hsa_status_t Ret = hsa_status_string(ResultCode, &Desc); |
| if (Ret != HSA_STATUS_SUCCESS) |
| REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code); |
| |
| return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(), |
| ErrFmt, Args..., Desc); |
| } |
| |
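| /// Allocate memory from the pool of this manager and enable access to the
| /// buffer for all kernel agents. Returns nullptr on failure.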
| void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, |
| TargetAllocTy Kind) { |
| // Allocate memory from the pool. |
| void *Ptr = nullptr; |
| if (auto Err = MemoryPool->allocate(Size, &Ptr)) { |
| consumeError(std::move(Err)); |
| return nullptr; |
| } |
| assert(Ptr && "Invalid pointer"); |
| |
| auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents(); |
| |
| // Allow all kernel agents to access the allocation. |
| if (auto Err = MemoryPool->enableAccess(Ptr, Size, KernelAgents)) { |
| REPORT("%s\n", toString(std::move(Err)).data()); |
| return nullptr; |
| } |
| return Ptr; |
| } |
| |
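| /// Allocate memory on the device, selecting the memory pool that corresponds
| /// to the requested allocation kind. Returns nullptr on failure.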
| void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { |
| if (Size == 0) |
| return nullptr; |
| |
| // Find the correct memory pool. |
| AMDGPUMemoryPoolTy *MemoryPool = nullptr; |
| switch (Kind) { |
| case TARGET_ALLOC_DEFAULT: |
| case TARGET_ALLOC_DEVICE: |
| case TARGET_ALLOC_DEVICE_NON_BLOCKING: |
| MemoryPool = CoarseGrainedMemoryPools[0]; |
| break; |
| case TARGET_ALLOC_HOST:
| case TARGET_ALLOC_SHARED:
| MemoryPool = &HostDevice.getFineGrainedMemoryPool();
| break;
| } |
| |
| if (!MemoryPool) { |
| REPORT("No memory pool for the specified allocation kind\n"); |
| return nullptr; |
| } |
| |
| // Allocate from the corresponding memory pool. |
| void *Alloc = nullptr; |
| if (Error Err = MemoryPool->allocate(Size, &Alloc)) { |
| REPORT("%s\n", toString(std::move(Err)).data()); |
| return nullptr; |
| } |
| |
| if (Alloc) { |
| auto &KernelAgents = Plugin::get<AMDGPUPluginTy>().getKernelAgents(); |
| // Enable all kernel agents to access the buffer. This is inherently
| // necessary for host and shared allocations and is also enabled for
| // device memory to allow device-to-device copies.
| if (auto Err = MemoryPool->enableAccess(Alloc, Size, KernelAgents)) { |
| REPORT("%s\n", toString(std::move(Err)).data()); |
| return nullptr; |
| } |
| } |
| |
| return Alloc; |
| } |
| |
| } // namespace plugin |
| } // namespace target |
| } // namespace omp |
| } // namespace llvm |