| //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // RTL for CUDA machine |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include <cassert> |
| #include <cstddef> |
| #include <cuda.h> |
| #include <list> |
| #include <memory> |
| #include <mutex> |
| #include <string> |
| #include <unordered_map> |
| #include <vector> |
| |
| #include "Debug.h" |
| #include "DeviceEnvironment.h" |
| #include "omptargetplugin.h" |
| |
| #define TARGET_NAME CUDA |
| #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" |
| |
| #include "MemoryManager.h" |
| |
| #include "llvm/Frontend/OpenMP/OMPConstants.h" |
| |
| // Utility for retrieving and printing CUDA error string. |
| #ifdef OMPTARGET_DEBUG |
| #define CUDA_ERR_STRING(err) \ |
| do { \ |
| if (getDebugLevel() > 0) { \ |
| const char *errStr = nullptr; \ |
| CUresult errStr_status = cuGetErrorString(err, &errStr); \ |
| if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ |
| REPORT("Unrecognized CUDA error code: %d\n", err); \ |
| else if (errStr_status == CUDA_SUCCESS) \ |
| REPORT("CUDA error is: %s\n", errStr); \ |
| else { \ |
| REPORT("Unresolved CUDA error code: %d\n", err); \ |
| REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ |
| errStr_status); \ |
| } \ |
| } else { \ |
| const char *errStr = nullptr; \ |
| CUresult errStr_status = cuGetErrorString(err, &errStr); \ |
| if (errStr_status == CUDA_SUCCESS) \ |
| REPORT("%s \n", errStr); \ |
| } \ |
| } while (false) |
| #else // OMPTARGET_DEBUG |
| #define CUDA_ERR_STRING(err) \ |
| do { \ |
| const char *errStr = nullptr; \ |
| CUresult errStr_status = cuGetErrorString(err, &errStr); \ |
| if (errStr_status == CUDA_SUCCESS) \ |
| REPORT("%s \n", errStr); \ |
| } while (false) |
| #endif // OMPTARGET_DEBUG |
| |
| #define BOOL2TEXT(b) ((b) ? "Yes" : "No") |
| |
| #include "elf_common.h" |
| |
| /// Keep entries table per device. |
| struct FuncOrGblEntryTy { |
| __tgt_target_table Table; |
| std::vector<__tgt_offload_entry> Entries; |
| }; |
| |
| /// Use a single entity to encode a kernel and a set of flags. |
| struct KernelTy { |
| CUfunction Func; |
| |
| // execution mode of kernel |
| llvm::omp::OMPTgtExecModeFlags ExecutionMode; |
| |
| /// Maximal number of threads per block for this kernel. |
| int MaxThreadsPerBlock = 0; |
| |
| KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode) |
| : Func(_Func), ExecutionMode(_ExecutionMode) {} |
| }; |
| |
| namespace { |
| bool checkResult(CUresult Err, const char *ErrMsg) { |
| if (Err == CUDA_SUCCESS) |
| return true; |
| |
| REPORT("%s", ErrMsg); |
| CUDA_ERR_STRING(Err); |
| return false; |
| } |
| |
| int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, |
| CUstream Stream) { |
| CUresult Err = |
| cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); |
| |
| if (Err != CUDA_SUCCESS) { |
| DP("Error when copying data from device to device. Pointers: src " |
| "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", |
| DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); |
| CUDA_ERR_STRING(Err); |
| return OFFLOAD_FAIL; |
| } |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| int createEvent(void **P) { |
| CUevent Event = nullptr; |
| |
| CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT); |
| if (Err != CUDA_SUCCESS) { |
| DP("Error when creating event event = " DPxMOD "\n", DPxPTR(Event)); |
| CUDA_ERR_STRING(Err); |
| return OFFLOAD_FAIL; |
| } |
| |
| *P = Event; |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { |
| CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); |
| CUevent Event = reinterpret_cast<CUevent>(EventPtr); |
| |
| CUresult Err = cuEventRecord(Event, Stream); |
| if (Err != CUDA_SUCCESS) { |
| DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n", |
| DPxPTR(Stream), DPxPTR(Event)); |
| CUDA_ERR_STRING(Err); |
| return OFFLOAD_FAIL; |
| } |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| int syncEvent(void *EventPtr) { |
| CUevent Event = reinterpret_cast<CUevent>(EventPtr); |
| |
| CUresult Err = cuEventSynchronize(Event); |
| if (Err != CUDA_SUCCESS) { |
| DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); |
| CUDA_ERR_STRING(Err); |
| return OFFLOAD_FAIL; |
| } |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| int destroyEvent(void *EventPtr) { |
| CUevent Event = reinterpret_cast<CUevent>(EventPtr); |
| |
| CUresult Err = cuEventDestroy(Event); |
| if (Err != CUDA_SUCCESS) { |
| DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event)); |
| CUDA_ERR_STRING(Err); |
| return OFFLOAD_FAIL; |
| } |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| // Structure contains per-device data |
| struct DeviceDataTy { |
| /// List that contains all the kernels. |
| std::list<KernelTy> KernelsList; |
| |
| std::list<FuncOrGblEntryTy> FuncGblEntries; |
| |
| CUcontext Context = nullptr; |
| // Device properties |
| int ThreadsPerBlock = 0; |
| int BlocksPerGrid = 0; |
| int WarpSize = 0; |
| // OpenMP properties |
| int NumTeams = 0; |
| int NumThreads = 0; |
| }; |
| |
| class StreamManagerTy { |
| int NumberOfDevices; |
| // The initial size of stream pool |
| int EnvNumInitialStreams; |
| // Per-device stream mutex |
| std::vector<std::unique_ptr<std::mutex>> StreamMtx; |
| // Per-device stream Id indicates the next available stream in the pool |
| std::vector<int> NextStreamId; |
| // Per-device stream pool |
| std::vector<std::vector<CUstream>> StreamPool; |
| // Reference to per-device data |
| std::vector<DeviceDataTy> &DeviceData; |
| |
| // If there is no CUstream left in the pool, we will resize the pool to |
| // allocate more CUstream. This function should be called with device mutex, |
| // and we do not resize to smaller one. |
| void resizeStreamPool(const int DeviceId, const size_t NewSize) { |
| std::vector<CUstream> &Pool = StreamPool[DeviceId]; |
| const size_t CurrentSize = Pool.size(); |
| assert(NewSize > CurrentSize && "new size is not larger than current size"); |
| |
| CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); |
| if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) { |
| // We will return if cannot switch to the right context in case of |
| // creating bunch of streams that are not corresponding to the right |
| // device. The offloading will fail later because selected CUstream is |
| // nullptr. |
| return; |
| } |
| |
| Pool.resize(NewSize, nullptr); |
| |
| for (size_t I = CurrentSize; I < NewSize; ++I) { |
| checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING), |
| "Error returned from cuStreamCreate\n"); |
| } |
| } |
| |
| public: |
| StreamManagerTy(const int NumberOfDevices, |
| std::vector<DeviceDataTy> &DeviceData) |
| : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32), |
| DeviceData(DeviceData) { |
| StreamPool.resize(NumberOfDevices); |
| NextStreamId.resize(NumberOfDevices); |
| StreamMtx.resize(NumberOfDevices); |
| |
| if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) |
| EnvNumInitialStreams = std::stoi(EnvStr); |
| |
| // Initialize the next stream id |
| std::fill(NextStreamId.begin(), NextStreamId.end(), 0); |
| |
| // Initialize stream mutex |
| for (std::unique_ptr<std::mutex> &Ptr : StreamMtx) |
| Ptr = std::make_unique<std::mutex>(); |
| } |
| |
| ~StreamManagerTy() { |
| // Destroy streams |
| for (int I = 0; I < NumberOfDevices; ++I) { |
| checkResult(cuCtxSetCurrent(DeviceData[I].Context), |
| "Error returned from cuCtxSetCurrent\n"); |
| |
| for (CUstream &S : StreamPool[I]) { |
| if (S) |
| checkResult(cuStreamDestroy(S), |
| "Error returned from cuStreamDestroy\n"); |
| } |
| } |
| } |
| |
| // Get a CUstream from pool. Per-device next stream id always points to the |
| // next available CUstream. That means, CUstreams [0, id-1] have been |
| // assigned, and [id,] are still available. If there is no CUstream left, we |
| // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned, |
| // the id will increase one. |
| // xxxxxs+++++++++ |
| // ^ |
| // id |
| // After assignment, the pool becomes the following and s is assigned. |
| // xxxxxs+++++++++ |
| // ^ |
| // id |
| CUstream getStream(const int DeviceId) { |
| const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]); |
| int &Id = NextStreamId[DeviceId]; |
| // No CUstream left in the pool, we need to request from CUDA RT |
| if (Id == static_cast<int>(StreamPool[DeviceId].size())) { |
| // By default we double the stream pool every time |
| resizeStreamPool(DeviceId, Id * 2); |
| } |
| return StreamPool[DeviceId][Id++]; |
| } |
| |
| // Return a CUstream back to pool. As mentioned above, per-device next |
| // stream is always points to the next available CUstream, so when we return |
| // a CUstream, we need to first decrease the id, and then copy the CUstream |
| // back. |
| // It is worth noting that, the order of streams return might be different |
| // from that they're assigned, that saying, at some point, there might be |
| // two identical CUstreams. |
| // xxax+a+++++ |
| // ^ |
| // id |
| // However, it doesn't matter, because they're always on the two sides of |
| // id. The left one will in the end be overwritten by another CUstream. |
| // Therefore, after several execution, the order of pool might be different |
| // from its initial state. |
| void returnStream(const int DeviceId, CUstream Stream) { |
| const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]); |
| int &Id = NextStreamId[DeviceId]; |
| assert(Id > 0 && "Wrong stream ID"); |
| StreamPool[DeviceId][--Id] = Stream; |
| } |
| |
| bool initializeDeviceStreamPool(const int DeviceId) { |
| assert(StreamPool[DeviceId].empty() && "stream pool has been initialized"); |
| |
| resizeStreamPool(DeviceId, EnvNumInitialStreams); |
| |
| // Check the size of stream pool |
| if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams) |
| return false; |
| |
| // Check whether each stream is valid |
| for (CUstream &S : StreamPool[DeviceId]) |
| if (!S) |
| return false; |
| |
| return true; |
| } |
| }; |
| |
| class DeviceRTLTy { |
| int NumberOfDevices; |
| // OpenMP environment properties |
| int EnvNumTeams; |
| int EnvTeamLimit; |
| int EnvTeamThreadLimit; |
| // OpenMP requires flags |
| int64_t RequiresFlags; |
| // Amount of dynamic shared memory to use at launch. |
| uint64_t DynamicMemorySize; |
| |
| static constexpr const int HardTeamLimit = 1U << 16U; // 64k |
| static constexpr const int HardThreadLimit = 1024; |
| static constexpr const int DefaultNumTeams = 128; |
| static constexpr const int DefaultNumThreads = 128; |
| |
| std::unique_ptr<StreamManagerTy> StreamManager; |
| std::vector<DeviceDataTy> DeviceData; |
| std::vector<CUmodule> Modules; |
| |
| /// A class responsible for interacting with device native runtime library to |
| /// allocate and free memory. |
| class CUDADeviceAllocatorTy : public DeviceAllocatorTy { |
| const int DeviceId; |
| const std::vector<DeviceDataTy> &DeviceData; |
| std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs; |
| |
| public: |
| CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData) |
| : DeviceId(DeviceId), DeviceData(DeviceData) {} |
| |
| void *allocate(size_t Size, void *, TargetAllocTy Kind) override { |
| if (Size == 0) |
| return nullptr; |
| |
| CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); |
| if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) |
| return nullptr; |
| |
| void *MemAlloc = nullptr; |
| switch (Kind) { |
| case TARGET_ALLOC_DEFAULT: |
| case TARGET_ALLOC_DEVICE: |
| CUdeviceptr DevicePtr; |
| Err = cuMemAlloc(&DevicePtr, Size); |
| MemAlloc = (void *)DevicePtr; |
| if (!checkResult(Err, "Error returned from cuMemAlloc\n")) |
| return nullptr; |
| break; |
| case TARGET_ALLOC_HOST: |
| void *HostPtr; |
| Err = cuMemAllocHost(&HostPtr, Size); |
| MemAlloc = HostPtr; |
| if (!checkResult(Err, "Error returned from cuMemAllocHost\n")) |
| return nullptr; |
| HostPinnedAllocs[MemAlloc] = Kind; |
| break; |
| case TARGET_ALLOC_SHARED: |
| CUdeviceptr SharedPtr; |
| Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL); |
| MemAlloc = (void *)SharedPtr; |
| if (!checkResult(Err, "Error returned from cuMemAllocManaged\n")) |
| return nullptr; |
| break; |
| } |
| |
| return MemAlloc; |
| } |
| |
| int free(void *TgtPtr) override { |
| CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); |
| if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) |
| return OFFLOAD_FAIL; |
| |
| // Host pinned memory must be freed differently. |
| TargetAllocTy Kind = |
| (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end()) |
| ? TARGET_ALLOC_DEFAULT |
| : TARGET_ALLOC_HOST; |
| switch (Kind) { |
| case TARGET_ALLOC_DEFAULT: |
| case TARGET_ALLOC_DEVICE: |
| case TARGET_ALLOC_SHARED: |
| Err = cuMemFree((CUdeviceptr)TgtPtr); |
| if (!checkResult(Err, "Error returned from cuMemFree\n")) |
| return OFFLOAD_FAIL; |
| break; |
| case TARGET_ALLOC_HOST: |
| Err = cuMemFreeHost(TgtPtr); |
| if (!checkResult(Err, "Error returned from cuMemFreeHost\n")) |
| return OFFLOAD_FAIL; |
| break; |
| } |
| |
| return OFFLOAD_SUCCESS; |
| } |
| }; |
| |
| /// A vector of device allocators |
| std::vector<CUDADeviceAllocatorTy> DeviceAllocators; |
| |
| /// A vector of memory managers. Since the memory manager is non-copyable and |
| // non-removable, we wrap them into std::unique_ptr. |
| std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers; |
| |
| /// Whether use memory manager |
| bool UseMemoryManager = true; |
| |
| // Record entry point associated with device |
| void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { |
| FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); |
| E.Entries.push_back(entry); |
| } |
| |
| // Return a pointer to the entry associated with the pointer |
| const __tgt_offload_entry *getOffloadEntry(const int DeviceId, |
| const void *Addr) const { |
| for (const __tgt_offload_entry &Itr : |
| DeviceData[DeviceId].FuncGblEntries.back().Entries) |
| if (Itr.addr == Addr) |
| return &Itr; |
| |
| return nullptr; |
| } |
| |
| // Return the pointer to the target entries table |
| __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { |
| FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); |
| |
| if (E.Entries.empty()) |
| return nullptr; |
| |
| // Update table info according to the entries and return the pointer |
| E.Table.EntriesBegin = E.Entries.data(); |
| E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); |
| |
| return &E.Table; |
| } |
| |
| // Clear entries table for a device |
| void clearOffloadEntriesTable(const int DeviceId) { |
| DeviceData[DeviceId].FuncGblEntries.emplace_back(); |
| FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); |
| E.Entries.clear(); |
| E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; |
| } |
| |
| CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { |
| assert(AsyncInfo && "AsyncInfo is nullptr"); |
| |
| if (!AsyncInfo->Queue) |
| AsyncInfo->Queue = StreamManager->getStream(DeviceId); |
| |
| return reinterpret_cast<CUstream>(AsyncInfo->Queue); |
| } |
| |
| public: |
| // This class should not be copied |
| DeviceRTLTy(const DeviceRTLTy &) = delete; |
| DeviceRTLTy(DeviceRTLTy &&) = delete; |
| |
| DeviceRTLTy() |
| : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), |
| EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), |
| DynamicMemorySize(0) { |
| |
| DP("Start initializing CUDA\n"); |
| |
| CUresult Err = cuInit(0); |
| if (Err == CUDA_ERROR_INVALID_HANDLE) { |
| // Can't call cuGetErrorString if dlsym failed |
| DP("Failed to load CUDA shared library\n"); |
| return; |
| } |
| if (!checkResult(Err, "Error returned from cuInit\n")) { |
| return; |
| } |
| |
| Err = cuDeviceGetCount(&NumberOfDevices); |
| if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) |
| return; |
| |
| if (NumberOfDevices == 0) { |
| DP("There are no devices supporting CUDA.\n"); |
| return; |
| } |
| |
| DeviceData.resize(NumberOfDevices); |
| |
| // Get environment variables regarding teams |
| if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { |
| // OMP_TEAM_LIMIT has been set |
| EnvTeamLimit = std::stoi(EnvStr); |
| DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); |
| } |
| if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { |
| // OMP_TEAMS_THREAD_LIMIT has been set |
| EnvTeamThreadLimit = std::stoi(EnvStr); |
| DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); |
| } |
| if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { |
| // OMP_NUM_TEAMS has been set |
| EnvNumTeams = std::stoi(EnvStr); |
| DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); |
| } |
| if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { |
| // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set |
| DynamicMemorySize = std::stoi(EnvStr); |
| DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", |
| DynamicMemorySize); |
| } |
| |
| StreamManager = |
| std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData); |
| |
| for (int I = 0; I < NumberOfDevices; ++I) |
| DeviceAllocators.emplace_back(I, DeviceData); |
| |
| // Get the size threshold from environment variable |
| std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv(); |
| UseMemoryManager = Res.second; |
| size_t MemoryManagerThreshold = Res.first; |
| |
| if (UseMemoryManager) |
| for (int I = 0; I < NumberOfDevices; ++I) |
| MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>( |
| DeviceAllocators[I], MemoryManagerThreshold)); |
| } |
| |
| ~DeviceRTLTy() { |
| // We first destruct memory managers in case that its dependent data are |
| // destroyed before it. |
| for (auto &M : MemoryManagers) |
| M.release(); |
| |
| StreamManager = nullptr; |
| |
| for (CUmodule &M : Modules) |
| // Close module |
| if (M) |
| checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); |
| |
| for (DeviceDataTy &D : DeviceData) { |
| // Destroy context |
| if (D.Context) { |
| checkResult(cuCtxSetCurrent(D.Context), |
| "Error returned from cuCtxSetCurrent\n"); |
| CUdevice Device; |
| checkResult(cuCtxGetDevice(&Device), |
| "Error returned from cuCtxGetDevice\n"); |
| checkResult(cuDevicePrimaryCtxRelease(Device), |
| "Error returned from cuDevicePrimaryCtxRelease\n"); |
| } |
| } |
| } |
| |
| // Check whether a given DeviceId is valid |
| bool isValidDeviceId(const int DeviceId) const { |
| return DeviceId >= 0 && DeviceId < NumberOfDevices; |
| } |
| |
| int getNumOfDevices() const { return NumberOfDevices; } |
| |
| void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } |
| |
| int initDevice(const int DeviceId) { |
| CUdevice Device; |
| |
| DP("Getting device %d\n", DeviceId); |
| CUresult Err = cuDeviceGet(&Device, DeviceId); |
| if (!checkResult(Err, "Error returned from cuDeviceGet\n")) |
| return OFFLOAD_FAIL; |
| |
| // Query the current flags of the primary context and set its flags if |
| // it is inactive |
| unsigned int FormerPrimaryCtxFlags = 0; |
| int FormerPrimaryCtxIsActive = 0; |
| Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, |
| &FormerPrimaryCtxIsActive); |
| if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) |
| return OFFLOAD_FAIL; |
| |
| if (FormerPrimaryCtxIsActive) { |
| DP("The primary context is active, no change to its flags\n"); |
| if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != |
| CU_CTX_SCHED_BLOCKING_SYNC) |
| DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); |
| } else { |
| DP("The primary context is inactive, set its flags to " |
| "CU_CTX_SCHED_BLOCKING_SYNC\n"); |
| Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); |
| if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) |
| return OFFLOAD_FAIL; |
| } |
| |
| // Retain the per device primary context and save it to use whenever this |
| // device is selected. |
| Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); |
| if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) |
| return OFFLOAD_FAIL; |
| |
| Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); |
| if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) |
| return OFFLOAD_FAIL; |
| |
| // Initialize stream pool |
| if (!StreamManager->initializeDeviceStreamPool(DeviceId)) |
| return OFFLOAD_FAIL; |
| |
| // Query attributes to determine number of threads/block and blocks/grid. |
| int MaxGridDimX; |
| Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, |
| Device); |
| if (Err != CUDA_SUCCESS) { |
| DP("Error getting max grid dimension, use default value %d\n", |
| DeviceRTLTy::DefaultNumTeams); |
| DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; |
| } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { |
| DP("Using %d CUDA blocks per grid\n", MaxGridDimX); |
| DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; |
| } else { |
| DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " |
| "at the hard limit\n", |
| MaxGridDimX, DeviceRTLTy::HardTeamLimit); |
| DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; |
| } |
| |
| // We are only exploiting threads along the x axis. |
| int MaxBlockDimX; |
| Err = cuDeviceGetAttribute(&MaxBlockDimX, |
| CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); |
| if (Err != CUDA_SUCCESS) { |
| DP("Error getting max block dimension, use default value %d\n", |
| DeviceRTLTy::DefaultNumThreads); |
| DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; |
| } else { |
| DP("Using %d CUDA threads per block\n", MaxBlockDimX); |
| DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; |
| |
| if (EnvTeamThreadLimit > 0 && |
| DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { |
| DP("Max CUDA threads per block %d exceeds the thread limit %d set by " |
| "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", |
| DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); |
| DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; |
| } |
| if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { |
| DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " |
| "capping at the hard limit\n", |
| DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); |
| DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; |
| } |
| } |
| |
| // Get and set warp size |
| int WarpSize; |
| Err = |
| cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); |
| if (Err != CUDA_SUCCESS) { |
| DP("Error getting warp size, assume default value 32\n"); |
| DeviceData[DeviceId].WarpSize = 32; |
| } else { |
| DP("Using warp size %d\n", WarpSize); |
| DeviceData[DeviceId].WarpSize = WarpSize; |
| } |
| |
| // Adjust teams to the env variables |
| if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { |
| DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", |
| EnvTeamLimit); |
| DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; |
| } |
| |
| size_t StackLimit; |
| size_t HeapLimit; |
| if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { |
| StackLimit = std::stol(EnvStr); |
| if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) |
| return OFFLOAD_FAIL; |
| } else { |
| if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) |
| return OFFLOAD_FAIL; |
| } |
| if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { |
| HeapLimit = std::stol(EnvStr); |
| if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) |
| return OFFLOAD_FAIL; |
| } else { |
| if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) |
| return OFFLOAD_FAIL; |
| } |
| |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Device supports up to %d CUDA blocks and %d threads with a " |
| "warp size of %d\n", |
| DeviceData[DeviceId].BlocksPerGrid, |
| DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Device heap size is %d Bytes, device stack size is %d Bytes per " |
| "thread\n", |
| (int)HeapLimit, (int)StackLimit); |
| |
| // Set default number of teams |
| if (EnvNumTeams > 0) { |
| DP("Default number of teams set according to environment %d\n", |
| EnvNumTeams); |
| DeviceData[DeviceId].NumTeams = EnvNumTeams; |
| } else { |
| DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; |
| DP("Default number of teams set according to library's default %d\n", |
| DeviceRTLTy::DefaultNumTeams); |
| } |
| |
| if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { |
| DP("Default number of teams exceeds device limit, capping at %d\n", |
| DeviceData[DeviceId].BlocksPerGrid); |
| DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; |
| } |
| |
| // Set default number of threads |
| DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; |
| DP("Default number of threads set according to library's default %d\n", |
| DeviceRTLTy::DefaultNumThreads); |
| if (DeviceData[DeviceId].NumThreads > |
| DeviceData[DeviceId].ThreadsPerBlock) { |
| DP("Default number of threads exceeds device limit, capping at %d\n", |
| DeviceData[DeviceId].ThreadsPerBlock); |
| DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; |
| } |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| __tgt_target_table *loadBinary(const int DeviceId, |
| const __tgt_device_image *Image) { |
| // Set the context we are using |
| CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); |
| if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) |
| return nullptr; |
| |
| // Clear the offload table as we are going to create a new one. |
| clearOffloadEntriesTable(DeviceId); |
| |
| // Create the module and extract the function pointers. |
| CUmodule Module; |
| DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); |
| Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); |
| if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) |
| return nullptr; |
| |
| DP("CUDA module successfully loaded!\n"); |
| |
| Modules.push_back(Module); |
| |
| // Find the symbols in the module by name. |
| const __tgt_offload_entry *HostBegin = Image->EntriesBegin; |
| const __tgt_offload_entry *HostEnd = Image->EntriesEnd; |
| |
| std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; |
| for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { |
| if (!E->addr) { |
| // We return nullptr when something like this happens, the host should |
| // have always something in the address to uniquely identify the target |
| // region. |
| DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); |
| return nullptr; |
| } |
| |
| if (E->size) { |
| __tgt_offload_entry Entry = *E; |
| CUdeviceptr CUPtr; |
| size_t CUSize; |
| Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); |
| // We keep this style here because we need the name |
| if (Err != CUDA_SUCCESS) { |
| REPORT("Loading global '%s' Failed\n", E->name); |
| CUDA_ERR_STRING(Err); |
| return nullptr; |
| } |
| |
| if (CUSize != E->size) { |
| DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, |
| CUSize, E->size); |
| return nullptr; |
| } |
| |
| DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", |
| DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); |
| |
| Entry.addr = (void *)(CUPtr); |
| |
| // Note: In the current implementation declare target variables |
| // can either be link or to. This means that once unified |
| // memory is activated via the requires directive, the variable |
| // can be used directly from the host in both cases. |
| // TODO: when variables types other than to or link are added, |
| // the below condition should be changed to explicitly |
| // check for to and link variables types: |
| // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & |
| // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) |
| if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { |
| // If unified memory is present any target link or to variables |
| // can access host addresses directly. There is no longer a |
| // need for device copies. |
| cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); |
| DP("Copy linked variable host address (" DPxMOD |
| ") to device address (" DPxMOD ")\n", |
| DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); |
| } |
| |
| addOffloadEntry(DeviceId, Entry); |
| |
| continue; |
| } |
| |
| CUfunction Func; |
| Err = cuModuleGetFunction(&Func, Module, E->name); |
| // We keep this style here because we need the name |
| if (Err != CUDA_SUCCESS) { |
| REPORT("Loading '%s' Failed\n", E->name); |
| CUDA_ERR_STRING(Err); |
| return nullptr; |
| } |
| |
| DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", |
| DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); |
| |
| // default value GENERIC (in case symbol is missing from cubin file) |
| llvm::omp::OMPTgtExecModeFlags ExecModeVal; |
| std::string ExecModeNameStr(E->name); |
| ExecModeNameStr += "_exec_mode"; |
| const char *ExecModeName = ExecModeNameStr.c_str(); |
| |
| CUdeviceptr ExecModePtr; |
| size_t CUSize; |
| Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); |
| if (Err == CUDA_SUCCESS) { |
| if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { |
| DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", |
| ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); |
| return nullptr; |
| } |
| |
| Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); |
| if (Err != CUDA_SUCCESS) { |
| REPORT("Error when copying data from device to host. Pointers: " |
| "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", |
| DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); |
| CUDA_ERR_STRING(Err); |
| return nullptr; |
| } |
| } else { |
| DP("Loading global exec_mode '%s' - symbol missing, using default " |
| "value GENERIC (1)\n", |
| ExecModeName); |
| } |
| |
| KernelsList.emplace_back(Func, ExecModeVal); |
| |
| __tgt_offload_entry Entry = *E; |
| Entry.addr = &KernelsList.back(); |
| addOffloadEntry(DeviceId, Entry); |
| } |
| |
| // send device environment data to the device |
| { |
| // TODO: The device ID used here is not the real device ID used by OpenMP. |
| DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices), |
| static_cast<uint32_t>(DeviceId), |
| static_cast<uint32_t>(DynamicMemorySize)}; |
| |
| if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) |
| DeviceEnv.DebugKind = std::stoi(EnvStr); |
| |
| const char *DeviceEnvName = "omptarget_device_environment"; |
| CUdeviceptr DeviceEnvPtr; |
| size_t CUSize; |
| |
| Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); |
| if (Err == CUDA_SUCCESS) { |
| if (CUSize != sizeof(DeviceEnv)) { |
| REPORT( |
| "Global device_environment '%s' - size mismatch (%zu != %zu)\n", |
| DeviceEnvName, CUSize, sizeof(int32_t)); |
| CUDA_ERR_STRING(Err); |
| return nullptr; |
| } |
| |
| Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); |
| if (Err != CUDA_SUCCESS) { |
| REPORT("Error when copying data from host to device. Pointers: " |
| "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", |
| DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); |
| CUDA_ERR_STRING(Err); |
| return nullptr; |
| } |
| |
| DP("Sending global device environment data %zu bytes\n", CUSize); |
| } else { |
| DP("Finding global device environment '%s' - symbol missing.\n", |
| DeviceEnvName); |
| DP("Continue, considering this is a device RTL which does not accept " |
| "environment setting.\n"); |
| } |
| } |
| |
| return getOffloadEntriesTable(DeviceId); |
| } |
| |
| void *dataAlloc(const int DeviceId, const int64_t Size, |
| const TargetAllocTy Kind) { |
| switch (Kind) { |
| case TARGET_ALLOC_DEFAULT: |
| case TARGET_ALLOC_DEVICE: |
| if (UseMemoryManager) |
| return MemoryManagers[DeviceId]->allocate(Size, nullptr); |
| else |
| return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); |
| case TARGET_ALLOC_HOST: |
| case TARGET_ALLOC_SHARED: |
| return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); |
| } |
| |
| REPORT("Invalid target data allocation kind or requested allocator not " |
| "implemented yet\n"); |
| |
| return nullptr; |
| } |
| |
| int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, |
| const int64_t Size, __tgt_async_info *AsyncInfo) const { |
| assert(AsyncInfo && "AsyncInfo is nullptr"); |
| |
| CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); |
| if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) |
| return OFFLOAD_FAIL; |
| |
| CUstream Stream = getStream(DeviceId, AsyncInfo); |
| |
| Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); |
| if (Err != CUDA_SUCCESS) { |
| DP("Error when copying data from host to device. Pointers: host " |
| "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", |
| DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); |
| CUDA_ERR_STRING(Err); |
| return OFFLOAD_FAIL; |
| } |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, |
| const int64_t Size, __tgt_async_info *AsyncInfo) const { |
| assert(AsyncInfo && "AsyncInfo is nullptr"); |
| |
| CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); |
| if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) |
| return OFFLOAD_FAIL; |
| |
| CUstream Stream = getStream(DeviceId, AsyncInfo); |
| |
| Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); |
| if (Err != CUDA_SUCCESS) { |
| DP("Error when copying data from device to host. Pointers: host " |
| "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", |
| DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); |
| CUDA_ERR_STRING(Err); |
| return OFFLOAD_FAIL; |
| } |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, |
| int64_t Size, __tgt_async_info *AsyncInfo) const { |
| assert(AsyncInfo && "AsyncInfo is nullptr"); |
| |
| CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); |
| if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) |
| return OFFLOAD_FAIL; |
| |
| CUstream Stream = getStream(SrcDevId, AsyncInfo); |
| |
| // If they are two devices, we try peer to peer copy first |
| if (SrcDevId != DstDevId) { |
| int CanAccessPeer = 0; |
| Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); |
| if (Err != CUDA_SUCCESS) { |
| REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 |
| ", dst = %" PRId32 "\n", |
| SrcDevId, DstDevId); |
| CUDA_ERR_STRING(Err); |
| return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); |
| } |
| |
| if (!CanAccessPeer) { |
| DP("P2P memcpy not supported so fall back to D2D memcpy"); |
| return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); |
| } |
| |
| Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); |
| if (Err != CUDA_SUCCESS) { |
| REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 |
| ", dst = %" PRId32 "\n", |
| SrcDevId, DstDevId); |
| CUDA_ERR_STRING(Err); |
| return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); |
| } |
| |
| Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, |
| (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, |
| Size, Stream); |
| if (Err == CUDA_SUCCESS) |
| return OFFLOAD_SUCCESS; |
| |
| DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD |
| ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", |
| DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); |
| CUDA_ERR_STRING(Err); |
| } |
| |
| return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); |
| } |
| |
| int dataDelete(const int DeviceId, void *TgtPtr) { |
| if (UseMemoryManager) |
| return MemoryManagers[DeviceId]->free(TgtPtr); |
| |
| return DeviceAllocators[DeviceId].free(TgtPtr); |
| } |
| |
| int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, |
| ptrdiff_t *TgtOffsets, const int ArgNum, |
| const int TeamNum, const int ThreadLimit, |
| const unsigned int LoopTripCount, |
| __tgt_async_info *AsyncInfo) const { |
| CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); |
| if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) |
| return OFFLOAD_FAIL; |
| |
| // All args are references. |
| std::vector<void *> Args(ArgNum); |
| std::vector<void *> Ptrs(ArgNum); |
| |
| for (int I = 0; I < ArgNum; ++I) { |
| Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); |
| Args[I] = &Ptrs[I]; |
| } |
| |
| KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr); |
| |
| const bool IsSPMDGenericMode = |
| KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD; |
| const bool IsSPMDMode = |
| KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD; |
| const bool IsGenericMode = |
| KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC; |
| |
| int CudaThreadsPerBlock; |
| if (ThreadLimit > 0) { |
| DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); |
| CudaThreadsPerBlock = ThreadLimit; |
| // Add master warp if necessary |
| if (IsGenericMode) { |
| DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); |
| CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; |
| } |
| } else { |
| DP("Setting CUDA threads per block to default %d\n", |
| DeviceData[DeviceId].NumThreads); |
| CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; |
| } |
| |
| if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { |
| DP("Threads per block capped at device limit %d\n", |
| DeviceData[DeviceId].ThreadsPerBlock); |
| CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; |
| } |
| |
| if (!KernelInfo->MaxThreadsPerBlock) { |
| Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, |
| CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, |
| KernelInfo->Func); |
| if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) |
| return OFFLOAD_FAIL; |
| } |
| |
| if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { |
| DP("Threads per block capped at kernel limit %d\n", |
| KernelInfo->MaxThreadsPerBlock); |
| CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock; |
| } |
| |
| unsigned int CudaBlocksPerGrid; |
| if (TeamNum <= 0) { |
| if (LoopTripCount > 0 && EnvNumTeams < 0) { |
| if (IsSPMDGenericMode) { |
| // If we reach this point, then we are executing a kernel that was |
| // transformed from Generic-mode to SPMD-mode. This kernel has |
| // SPMD-mode execution, but needs its blocks to be scheduled |
| // differently because the current loop trip count only applies to the |
| // `teams distribute` region and will create var too few blocks using |
| // the regular SPMD-mode method. |
| CudaBlocksPerGrid = LoopTripCount; |
| } else if (IsSPMDMode) { |
| // We have a combined construct, i.e. `target teams distribute |
| // parallel for [simd]`. We launch so many teams so that each thread |
| // will execute one iteration of the loop. round up to the nearest |
| // integer |
| CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; |
| } else if (IsGenericMode) { |
| // If we reach this point, then we have a non-combined construct, i.e. |
| // `teams distribute` with a nested `parallel for` and each team is |
| // assigned one iteration of the `distribute` loop. E.g.: |
| // |
| // #pragma omp target teams distribute |
| // for(...loop_tripcount...) { |
| // #pragma omp parallel for |
| // for(...) {} |
| // } |
| // |
| // Threads within a team will execute the iterations of the `parallel` |
| // loop. |
| CudaBlocksPerGrid = LoopTripCount; |
| } else { |
| REPORT("Unknown execution mode: %d\n", |
| static_cast<int8_t>(KernelInfo->ExecutionMode)); |
| return OFFLOAD_FAIL; |
| } |
| DP("Using %d teams due to loop trip count %" PRIu32 |
| " and number of threads per block %d\n", |
| CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); |
| } else { |
| DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); |
| CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; |
| } |
| } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { |
| DP("Capping number of teams to team limit %d\n", |
| DeviceData[DeviceId].BlocksPerGrid); |
| CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; |
| } else { |
| DP("Using requested number of teams %d\n", TeamNum); |
| CudaBlocksPerGrid = TeamNum; |
| } |
| |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Launching kernel %s with %d blocks and %d threads in %s mode\n", |
| (getOffloadEntry(DeviceId, TgtEntryPtr)) |
| ? getOffloadEntry(DeviceId, TgtEntryPtr)->name |
| : "(null)", |
| CudaBlocksPerGrid, CudaThreadsPerBlock, |
| (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD")); |
| |
| CUstream Stream = getStream(DeviceId, AsyncInfo); |
| Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, |
| /* gridDimZ */ 1, CudaThreadsPerBlock, |
| /* blockDimY */ 1, /* blockDimZ */ 1, |
| DynamicMemorySize, Stream, &Args[0], nullptr); |
| if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) |
| return OFFLOAD_FAIL; |
| |
| DP("Launch of entry point at " DPxMOD " successful!\n", |
| DPxPTR(TgtEntryPtr)); |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { |
| CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); |
| CUresult Err = cuStreamSynchronize(Stream); |
| |
| // Once the stream is synchronized, return it to stream pool and reset |
| // AsyncInfo. This is to make sure the synchronization only works for its |
| // own tasks. |
| StreamManager->returnStream(DeviceId, |
| reinterpret_cast<CUstream>(AsyncInfo->Queue)); |
| AsyncInfo->Queue = nullptr; |
| |
| if (Err != CUDA_SUCCESS) { |
| DP("Error when synchronizing stream. stream = " DPxMOD |
| ", async info ptr = " DPxMOD "\n", |
| DPxPTR(Stream), DPxPTR(AsyncInfo)); |
| CUDA_ERR_STRING(Err); |
| } |
| return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL; |
| } |
| |
| void printDeviceInfo(int32_t device_id) { |
| char TmpChar[1000]; |
| std::string TmpStr; |
| size_t TmpSt; |
| int TmpInt, TmpInt2, TmpInt3; |
| |
| CUdevice Device; |
| checkResult(cuDeviceGet(&Device, device_id), |
| "Error returned from cuCtxGetDevice\n"); |
| |
| cuDriverGetVersion(&TmpInt); |
| printf(" CUDA Driver Version: \t\t%d \n", TmpInt); |
| printf(" CUDA Device Number: \t\t%d \n", device_id); |
| checkResult(cuDeviceGetName(TmpChar, 1000, Device), |
| "Error returned from cuDeviceGetName\n"); |
| printf(" Device Name: \t\t\t%s \n", TmpChar); |
| checkResult(cuDeviceTotalMem(&TmpSt, Device), |
| "Error returned from cuDeviceTotalMem\n"); |
| printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); |
| checkResult( |
| cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); |
| checkResult( |
| cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); |
| checkResult( |
| cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Registers per Block: \t\t%d \n", TmpInt); |
| checkResult( |
| cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, |
| TmpInt3); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, |
| TmpInt3); |
| checkResult( |
| cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); |
| checkResult( |
| cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult( |
| cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult( |
| cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| if (TmpInt == CU_COMPUTEMODE_DEFAULT) |
| TmpStr = "DEFAULT"; |
| else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) |
| TmpStr = "PROHIBITED"; |
| else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) |
| TmpStr = "EXCLUSIVE PROCESS"; |
| else |
| TmpStr = "unknown"; |
| printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult( |
| cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); |
| checkResult( |
| cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); |
| checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, |
| Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, |
| Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult( |
| cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult( |
| cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult(cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt)); |
| checkResult( |
| cuDeviceGetAttribute( |
| &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| checkResult( |
| cuDeviceGetAttribute( |
| &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device), |
| "Error returned from cuDeviceGetAttribute\n"); |
| printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); |
| } |
| |
| int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo, |
| void *EventPtr) const { |
| CUstream Stream = getStream(DeviceId, AsyncInfo); |
| CUevent Event = reinterpret_cast<CUevent>(EventPtr); |
| |
| // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from |
| // specific CUDA version, and defined as 0x0. In previous version, per CUDA |
| // API document, that argument has to be 0x0. |
| CUresult Err = cuStreamWaitEvent(Stream, Event, 0); |
| if (Err != CUDA_SUCCESS) { |
| DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n", |
| DPxPTR(Stream), DPxPTR(Event)); |
| CUDA_ERR_STRING(Err); |
| return OFFLOAD_FAIL; |
| } |
| |
| return OFFLOAD_SUCCESS; |
| } |
| }; |
| |
| DeviceRTLTy DeviceRTL; |
| } // namespace |
| |
| // Exposed library API function |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| |
| int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { |
| return elf_check_machine(image, /* EM_CUDA */ 190); |
| } |
| |
| int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } |
| |
| int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { |
| DP("Init requires flags to %" PRId64 "\n", RequiresFlags); |
| DeviceRTL.setRequiresFlag(RequiresFlags); |
| return RequiresFlags; |
| } |
| |
| int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { |
| if (DeviceRTL.isValidDeviceId(src_dev_id) && |
| DeviceRTL.isValidDeviceId(dst_dev_id)) |
| return 1; |
| |
| return 0; |
| } |
| |
| int32_t __tgt_rtl_init_device(int32_t device_id) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| |
| return DeviceRTL.initDevice(device_id); |
| } |
| |
| __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, |
| __tgt_device_image *image) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| |
| return DeviceRTL.loadBinary(device_id, image); |
| } |
| |
| void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, |
| int32_t kind) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| |
| return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); |
| } |
| |
| int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, |
| int64_t size) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| |
| __tgt_async_info AsyncInfo; |
| const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, |
| size, &AsyncInfo); |
| if (rc != OFFLOAD_SUCCESS) |
| return OFFLOAD_FAIL; |
| |
| return __tgt_rtl_synchronize(device_id, &AsyncInfo); |
| } |
| |
| int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, |
| void *hst_ptr, int64_t size, |
| __tgt_async_info *async_info_ptr) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| assert(async_info_ptr && "async_info_ptr is nullptr"); |
| |
| return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, |
| async_info_ptr); |
| } |
| |
| int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, |
| int64_t size) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| |
| __tgt_async_info AsyncInfo; |
| const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, |
| size, &AsyncInfo); |
| if (rc != OFFLOAD_SUCCESS) |
| return OFFLOAD_FAIL; |
| |
| return __tgt_rtl_synchronize(device_id, &AsyncInfo); |
| } |
| |
| int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, |
| void *tgt_ptr, int64_t size, |
| __tgt_async_info *async_info_ptr) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| assert(async_info_ptr && "async_info_ptr is nullptr"); |
| |
| return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, |
| async_info_ptr); |
| } |
| |
| int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, |
| int dst_dev_id, void *dst_ptr, |
| int64_t size, |
| __tgt_async_info *AsyncInfo) { |
| assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); |
| assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); |
| assert(AsyncInfo && "AsyncInfo is nullptr"); |
| |
| return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, |
| AsyncInfo); |
| } |
| |
| int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, |
| int32_t dst_dev_id, void *dst_ptr, |
| int64_t size) { |
| assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); |
| assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); |
| |
| __tgt_async_info AsyncInfo; |
| const int32_t rc = __tgt_rtl_data_exchange_async( |
| src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo); |
| if (rc != OFFLOAD_SUCCESS) |
| return OFFLOAD_FAIL; |
| |
| return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo); |
| } |
| |
| int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| |
| return DeviceRTL.dataDelete(device_id, tgt_ptr); |
| } |
| |
| int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, |
| void **tgt_args, |
| ptrdiff_t *tgt_offsets, |
| int32_t arg_num, int32_t team_num, |
| int32_t thread_limit, |
| uint64_t loop_tripcount) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| |
| __tgt_async_info AsyncInfo; |
| const int32_t rc = __tgt_rtl_run_target_team_region_async( |
| device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, |
| thread_limit, loop_tripcount, &AsyncInfo); |
| if (rc != OFFLOAD_SUCCESS) |
| return OFFLOAD_FAIL; |
| |
| return __tgt_rtl_synchronize(device_id, &AsyncInfo); |
| } |
| |
| int32_t __tgt_rtl_run_target_team_region_async( |
| int32_t device_id, void *tgt_entry_ptr, void **tgt_args, |
| ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, |
| int32_t thread_limit, uint64_t loop_tripcount, |
| __tgt_async_info *async_info_ptr) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| |
| return DeviceRTL.runTargetTeamRegion( |
| device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, |
| thread_limit, loop_tripcount, async_info_ptr); |
| } |
| |
| int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, |
| void **tgt_args, ptrdiff_t *tgt_offsets, |
| int32_t arg_num) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| |
| __tgt_async_info AsyncInfo; |
| const int32_t rc = __tgt_rtl_run_target_region_async( |
| device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo); |
| if (rc != OFFLOAD_SUCCESS) |
| return OFFLOAD_FAIL; |
| |
| return __tgt_rtl_synchronize(device_id, &AsyncInfo); |
| } |
| |
| int32_t __tgt_rtl_run_target_region_async(int32_t device_id, |
| void *tgt_entry_ptr, void **tgt_args, |
| ptrdiff_t *tgt_offsets, |
| int32_t arg_num, |
| __tgt_async_info *async_info_ptr) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| |
| return __tgt_rtl_run_target_team_region_async( |
| device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, |
| /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, |
| async_info_ptr); |
| } |
| |
| int32_t __tgt_rtl_synchronize(int32_t device_id, |
| __tgt_async_info *async_info_ptr) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| assert(async_info_ptr && "async_info_ptr is nullptr"); |
| assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); |
| |
| return DeviceRTL.synchronize(device_id, async_info_ptr); |
| } |
| |
| void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { |
| std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); |
| InfoLevel.store(NewInfoLevel); |
| } |
| |
| void __tgt_rtl_print_device_info(int32_t device_id) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| DeviceRTL.printDeviceInfo(device_id); |
| } |
| |
| int32_t __tgt_rtl_create_event(int32_t device_id, void **event) { |
| assert(event && "event is nullptr"); |
| return createEvent(event); |
| } |
| |
| int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr, |
| __tgt_async_info *async_info_ptr) { |
| assert(async_info_ptr && "async_info_ptr is nullptr"); |
| assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); |
| assert(event_ptr && "event_ptr is nullptr"); |
| |
| return recordEvent(event_ptr, async_info_ptr); |
| } |
| |
| int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr, |
| __tgt_async_info *async_info_ptr) { |
| assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); |
| assert(async_info_ptr && "async_info_ptr is nullptr"); |
| assert(event_ptr && "event is nullptr"); |
| |
| return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr); |
| } |
| |
| int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) { |
| assert(event_ptr && "event is nullptr"); |
| |
| return syncEvent(event_ptr); |
| } |
| |
| int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) { |
| assert(event_ptr && "event is nullptr"); |
| |
| return destroyEvent(event_ptr); |
| } |
| |
| #ifdef __cplusplus |
| } |
| #endif |