| //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // RTL NextGen for CUDA machine |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include <cassert> |
| #include <cstddef> |
| #include <cuda.h> |
| #include <string> |
| #include <unordered_map> |
| |
| #include "Shared/Debug.h" |
| #include "Shared/Environment.h" |
| |
| #include "GlobalHandler.h" |
| #include "OpenMP/OMPT/Callback.h" |
| #include "PluginInterface.h" |
| |
| #include "llvm/BinaryFormat/ELF.h" |
| #include "llvm/Frontend/OpenMP/OMPConstants.h" |
| #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
| #include "llvm/Support/Error.h" |
| |
| namespace llvm { |
| namespace omp { |
| namespace target { |
| namespace plugin { |
| |
| /// Forward declarations for all specialized data structures. |
| struct CUDAKernelTy; |
| struct CUDADeviceTy; |
| struct CUDAPluginTy; |
| |
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11000))
/// Fallback declarations for the Virtual Memory Management (VMM) data
/// structures and driver entry points. This is necessary for older CUDA
/// versions, whose headers do not provide them.
///
/// Every stub reports CUDA_ERROR_NOT_SUPPORTED. Flowing off the end of a
/// function with a non-void return type is undefined behavior, so each stub
/// must return a value even though it is not expected to be called.
typedef void *CUmemGenericAllocationHandle;
typedef void *CUmemAllocationProp;
typedef void *CUmemAccessDesc;
typedef void *CUmemAllocationGranularity_flags;

/// Stub: reserving a virtual address range is unavailable.
CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
                             CUdeviceptr addr, unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}

/// Stub: mapping an allocation handle into an address range is unavailable.
CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
                  CUmemGenericAllocationHandle handle,
                  unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}

/// Stub: creating an allocation handle is unavailable.
CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
                     const CUmemAllocationProp *prop,
                     unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}

/// Stub: setting access flags on a mapped range is unavailable.
CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
                        const CUmemAccessDesc *desc, size_t count) {
  return CUDA_ERROR_NOT_SUPPORTED;
}

/// Stub: querying the allocation granularity is unavailable.
CUresult
cuMemGetAllocationGranularity(size_t *granularity,
                              const CUmemAllocationProp *prop,
                              CUmemAllocationGranularity_flags option) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif
| |
| /// Class implementing the CUDA device images properties. |
| struct CUDADeviceImageTy : public DeviceImageTy { |
| /// Create the CUDA image with the id and the target image pointer. |
| CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) |
| : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {} |
| |
| /// Load the image as a CUDA module. |
| Error loadModule() { |
| assert(!Module && "Module already loaded"); |
| |
| CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr); |
| if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s")) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| /// Unload the CUDA module corresponding to the image. |
| Error unloadModule() { |
| assert(Module && "Module not loaded"); |
| |
| CUresult Res = cuModuleUnload(Module); |
| if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s")) |
| return Err; |
| |
| Module = nullptr; |
| |
| return Plugin::success(); |
| } |
| |
| /// Getter of the CUDA module. |
| CUmodule getModule() const { return Module; } |
| |
| private: |
| /// The CUDA module that loaded the image. |
| CUmodule Module; |
| }; |
| |
| /// Class implementing the CUDA kernel functionalities which derives from the |
| /// generic kernel class. |
| struct CUDAKernelTy : public GenericKernelTy { |
| /// Create a CUDA kernel with a name and an execution mode. |
| CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {} |
| |
| /// Initialize the CUDA kernel. |
| Error initImpl(GenericDeviceTy &GenericDevice, |
| DeviceImageTy &Image) override { |
| CUresult Res; |
| CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image); |
| |
| // Retrieve the function pointer of the kernel. |
| Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName()); |
| if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s", |
| getName())) |
| return Err; |
| |
| // Check that the function pointer is valid. |
| if (!Func) |
| return Plugin::error("Invalid function for kernel %s", getName()); |
| |
| int MaxThreads; |
| Res = cuFuncGetAttribute(&MaxThreads, |
| CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func); |
| if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s")) |
| return Err; |
| |
| // The maximum number of threads cannot exceed the maximum of the kernel. |
| MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads); |
| |
| return Plugin::success(); |
| } |
| |
| /// Launch the CUDA kernel function. |
| Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, |
| uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) const override; |
| |
| private: |
| /// The CUDA kernel function to execute. |
| CUfunction Func; |
| }; |
| |
| /// Class wrapping a CUDA stream reference. These are the objects handled by the |
| /// Stream Manager for the CUDA plugin. |
| struct CUDAStreamRef final : public GenericDeviceResourceRef { |
| /// The underlying handle type for streams. |
| using HandleTy = CUstream; |
| |
| /// Create an empty reference to an invalid stream. |
| CUDAStreamRef() : Stream(nullptr) {} |
| |
| /// Create a reference to an existing stream. |
| CUDAStreamRef(HandleTy Stream) : Stream(Stream) {} |
| |
| /// Create a new stream and save the reference. The reference must be empty |
| /// before calling to this function. |
| Error create(GenericDeviceTy &Device) override { |
| if (Stream) |
| return Plugin::error("Creating an existing stream"); |
| |
| CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING); |
| if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s")) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| /// Destroy the referenced stream and invalidate the reference. The reference |
| /// must be to a valid stream before calling to this function. |
| Error destroy(GenericDeviceTy &Device) override { |
| if (!Stream) |
| return Plugin::error("Destroying an invalid stream"); |
| |
| CUresult Res = cuStreamDestroy(Stream); |
| if (auto Err = Plugin::check(Res, "Error in cuStreamDestroy: %s")) |
| return Err; |
| |
| Stream = nullptr; |
| return Plugin::success(); |
| } |
| |
| /// Get the underlying CUDA stream. |
| operator HandleTy() const { return Stream; } |
| |
| private: |
| /// The reference to the CUDA stream. |
| HandleTy Stream; |
| }; |
| |
| /// Class wrapping a CUDA event reference. These are the objects handled by the |
| /// Event Manager for the CUDA plugin. |
| struct CUDAEventRef final : public GenericDeviceResourceRef { |
| /// The underlying handle type for events. |
| using HandleTy = CUevent; |
| |
| /// Create an empty reference to an invalid event. |
| CUDAEventRef() : Event(nullptr) {} |
| |
| /// Create a reference to an existing event. |
| CUDAEventRef(HandleTy Event) : Event(Event) {} |
| |
| /// Create a new event and save the reference. The reference must be empty |
| /// before calling to this function. |
| Error create(GenericDeviceTy &Device) override { |
| if (Event) |
| return Plugin::error("Creating an existing event"); |
| |
| CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT); |
| if (auto Err = Plugin::check(Res, "Error in cuEventCreate: %s")) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| /// Destroy the referenced event and invalidate the reference. The reference |
| /// must be to a valid event before calling to this function. |
| Error destroy(GenericDeviceTy &Device) override { |
| if (!Event) |
| return Plugin::error("Destroying an invalid event"); |
| |
| CUresult Res = cuEventDestroy(Event); |
| if (auto Err = Plugin::check(Res, "Error in cuEventDestroy: %s")) |
| return Err; |
| |
| Event = nullptr; |
| return Plugin::success(); |
| } |
| |
| /// Get the underlying CUevent. |
| operator HandleTy() const { return Event; } |
| |
| private: |
| /// The reference to the CUDA event. |
| HandleTy Event; |
| }; |
| |
| /// Class implementing the CUDA device functionalities which derives from the |
| /// generic device class. |
| struct CUDADeviceTy : public GenericDeviceTy { |
| // Create a CUDA device with a device id and the default CUDA grid values. |
| CUDADeviceTy(int32_t DeviceId, int32_t NumDevices) |
| : GenericDeviceTy(DeviceId, NumDevices, NVPTXGridValues), |
| CUDAStreamManager(*this), CUDAEventManager(*this) {} |
| |
| ~CUDADeviceTy() {} |
| |
| /// Initialize the device, its resources and get its properties. |
| Error initImpl(GenericPluginTy &Plugin) override { |
| CUresult Res = cuDeviceGet(&Device, DeviceId); |
| if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s")) |
| return Err; |
| |
| // Query the current flags of the primary context and set its flags if |
| // it is inactive. |
| unsigned int FormerPrimaryCtxFlags = 0; |
| int FormerPrimaryCtxIsActive = 0; |
| Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, |
| &FormerPrimaryCtxIsActive); |
| if (auto Err = |
| Plugin::check(Res, "Error in cuDevicePrimaryCtxGetState: %s")) |
| return Err; |
| |
| if (FormerPrimaryCtxIsActive) { |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "The primary context is active, no change to its flags\n"); |
| if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != |
| CU_CTX_SCHED_BLOCKING_SYNC) |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); |
| } else { |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "The primary context is inactive, set its flags to " |
| "CU_CTX_SCHED_BLOCKING_SYNC\n"); |
| Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); |
| if (auto Err = |
| Plugin::check(Res, "Error in cuDevicePrimaryCtxSetFlags: %s")) |
| return Err; |
| } |
| |
| // Retain the per device primary context and save it to use whenever this |
| // device is selected. |
| Res = cuDevicePrimaryCtxRetain(&Context, Device); |
| if (auto Err = Plugin::check(Res, "Error in cuDevicePrimaryCtxRetain: %s")) |
| return Err; |
| |
| if (auto Err = setContext()) |
| return Err; |
| |
| // Initialize stream pool. |
| if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams)) |
| return Err; |
| |
| // Initialize event pool. |
| if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents)) |
| return Err; |
| |
| // Query attributes to determine number of threads/block and blocks/grid. |
| if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, |
| GridValues.GV_Max_Teams)) |
| return Err; |
| |
| if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, |
| GridValues.GV_Max_WG_Size)) |
| return Err; |
| |
| if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, |
| GridValues.GV_Warp_Size)) |
| return Err; |
| |
| if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, |
| ComputeCapability.Major)) |
| return Err; |
| |
| if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, |
| ComputeCapability.Minor)) |
| return Err; |
| |
| uint32_t NumMuliprocessors = 0; |
| uint32_t MaxThreadsPerSM = 0; |
| uint32_t WarpSize = 0; |
| if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, |
| NumMuliprocessors)) |
| return Err; |
| if (auto Err = |
| getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, |
| MaxThreadsPerSM)) |
| return Err; |
| if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize)) |
| return Err; |
| HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize); |
| |
| return Plugin::success(); |
| } |
| |
| /// Deinitialize the device and release its resources. |
| Error deinitImpl() override { |
| if (Context) { |
| if (auto Err = setContext()) |
| return Err; |
| } |
| |
| // Deinitialize the stream manager. |
| if (auto Err = CUDAStreamManager.deinit()) |
| return Err; |
| |
| if (auto Err = CUDAEventManager.deinit()) |
| return Err; |
| |
| // Close modules if necessary. |
| if (!LoadedImages.empty()) { |
| assert(Context && "Invalid CUDA context"); |
| |
| // Each image has its own module. |
| for (DeviceImageTy *Image : LoadedImages) { |
| CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image); |
| |
| // Unload the module of the image. |
| if (auto Err = CUDAImage.unloadModule()) |
| return Err; |
| } |
| } |
| |
| if (Context) { |
| CUresult Res = cuDevicePrimaryCtxRelease(Device); |
| if (auto Err = |
| Plugin::check(Res, "Error in cuDevicePrimaryCtxRelease: %s")) |
| return Err; |
| } |
| |
| // Invalidate context and device references. |
| Context = nullptr; |
| Device = CU_DEVICE_INVALID; |
| |
| return Plugin::success(); |
| } |
| |
| virtual Error callGlobalConstructors(GenericPluginTy &Plugin, |
| DeviceImageTy &Image) override { |
| return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); |
| } |
| |
| virtual Error callGlobalDestructors(GenericPluginTy &Plugin, |
| DeviceImageTy &Image) override { |
| return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); |
| } |
| |
| /// Allocate and construct a CUDA kernel. |
| Expected<GenericKernelTy &> |
| constructKernel(const __tgt_offload_entry &KernelEntry) override { |
| // Allocate and construct the CUDA kernel. |
| CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>(); |
| if (!CUDAKernel) |
| return Plugin::error("Failed to allocate memory for CUDA kernel"); |
| |
| new (CUDAKernel) CUDAKernelTy(KernelEntry.name); |
| |
| return *CUDAKernel; |
| } |
| |
| /// Set the current context to this device's context. |
| Error setContext() override { |
| CUresult Res = cuCtxSetCurrent(Context); |
| return Plugin::check(Res, "Error in cuCtxSetCurrent: %s"); |
| } |
| |
/// Return the hardware parallelism of the device. For NVIDIA this is the
/// product of the SM count and the number of warps that fit on an SM when the
/// maximum number of threads is scheduled on it; the value is the one stored
/// in HardwareParallelism.
uint64_t getHardwareParallelism() const override {
  return HardwareParallelism;
}
| |
/// We want to set up the RPC server for host services to the GPU if it is
/// available. The decision is delegated to libomptargetSupportsRPC().
bool shouldSetupRPCServer() const override {
  return libomptargetSupportsRPC();
}
| |
/// The RPC interface should have enough space for all available parallelism,
/// so the requested port count equals getHardwareParallelism().
uint64_t requestedRPCPortCount() const override {
  return getHardwareParallelism();
}
| |
| /// Get the stream of the asynchronous info sructure or get a new one. |
| Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) { |
| // Get the stream (if any) from the async info. |
| Stream = AsyncInfoWrapper.getQueueAs<CUstream>(); |
| if (!Stream) { |
| // There was no stream; get an idle one. |
| if (auto Err = CUDAStreamManager.getResource(Stream)) |
| return Err; |
| |
| // Modify the async info's stream. |
| AsyncInfoWrapper.setQueueAs<CUstream>(Stream); |
| } |
| return Plugin::success(); |
| } |
| |
/// Getters of CUDA references.
/// Return the CUDA context stored for this device.
CUcontext getCUDAContext() const { return Context; }
/// Return the CUDA device handle stored for this device.
CUdevice getCUDADevice() const { return Device; }
| |
| /// Load the binary image into the device and allocate an image object. |
| Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage, |
| int32_t ImageId) override { |
| if (auto Err = setContext()) |
| return std::move(Err); |
| |
| // Allocate and initialize the image object. |
| CUDADeviceImageTy *CUDAImage = Plugin::get().allocate<CUDADeviceImageTy>(); |
| new (CUDAImage) CUDADeviceImageTy(ImageId, TgtImage); |
| |
| // Load the CUDA module. |
| if (auto Err = CUDAImage->loadModule()) |
| return std::move(Err); |
| |
| return CUDAImage; |
| } |
| |
| /// Allocate memory on the device or related to the device. |
| void *allocate(size_t Size, void *, TargetAllocTy Kind) override { |
| if (Size == 0) |
| return nullptr; |
| |
| if (auto Err = setContext()) { |
| REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); |
| return nullptr; |
| } |
| |
| void *MemAlloc = nullptr; |
| CUdeviceptr DevicePtr; |
| CUresult Res; |
| |
| switch (Kind) { |
| case TARGET_ALLOC_DEFAULT: |
| case TARGET_ALLOC_DEVICE: |
| Res = cuMemAlloc(&DevicePtr, Size); |
| MemAlloc = (void *)DevicePtr; |
| break; |
| case TARGET_ALLOC_HOST: |
| Res = cuMemAllocHost(&MemAlloc, Size); |
| break; |
| case TARGET_ALLOC_SHARED: |
| Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL); |
| MemAlloc = (void *)DevicePtr; |
| break; |
| } |
| |
| if (auto Err = |
| Plugin::check(Res, "Error in cuMemAlloc[Host|Managed]: %s")) { |
| REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); |
| return nullptr; |
| } |
| return MemAlloc; |
| } |
| |
| /// Deallocate memory on the device or related to the device. |
| int free(void *TgtPtr, TargetAllocTy Kind) override { |
| if (TgtPtr == nullptr) |
| return OFFLOAD_SUCCESS; |
| |
| if (auto Err = setContext()) { |
| REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); |
| return OFFLOAD_FAIL; |
| } |
| |
| CUresult Res; |
| switch (Kind) { |
| case TARGET_ALLOC_DEFAULT: |
| case TARGET_ALLOC_DEVICE: |
| case TARGET_ALLOC_SHARED: |
| Res = cuMemFree((CUdeviceptr)TgtPtr); |
| break; |
| case TARGET_ALLOC_HOST: |
| Res = cuMemFreeHost(TgtPtr); |
| break; |
| } |
| |
| if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) { |
| REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); |
| return OFFLOAD_FAIL; |
| } |
| return OFFLOAD_SUCCESS; |
| } |
| |
| /// Synchronize current thread with the pending operations on the async info. |
| Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { |
| CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue); |
| CUresult Res; |
| // If we have an RPC server running on this device we will continuously |
| // query it for work rather than blocking. |
| if (!getRPCServer()) { |
| Res = cuStreamSynchronize(Stream); |
| } else { |
| do { |
| Res = cuStreamQuery(Stream); |
| if (auto Err = getRPCServer()->runServer(*this)) |
| return Err; |
| } while (Res == CUDA_ERROR_NOT_READY); |
| } |
| |
| // Once the stream is synchronized, return it to stream pool and reset |
| // AsyncInfo. This is to make sure the synchronization only works for its |
| // own tasks. |
| AsyncInfo.Queue = nullptr; |
| if (auto Err = CUDAStreamManager.returnResource(Stream)) |
| return Err; |
| |
| return Plugin::check(Res, "Error in cuStreamSynchronize: %s"); |
| } |
| |
| /// CUDA support VA management |
| bool supportVAManagement() const override { |
| #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11000)) |
| return true; |
| #else |
| return false; |
| #endif |
| } |
| |
| /// Allocates \p RSize bytes (rounded up to page size) and hints the cuda |
| /// driver to map it to \p VAddr. The obtained address is stored in \p Addr. |
| /// At return \p RSize contains the actual size |
| Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override { |
| CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr); |
| auto IHandle = DeviceMMaps.find(DVAddr); |
| size_t Size = *RSize; |
| |
| if (Size == 0) |
| return Plugin::error("Memory Map Size must be larger than 0"); |
| |
| // Check if we have already mapped this address |
| if (IHandle != DeviceMMaps.end()) |
| return Plugin::error("Address already memory mapped"); |
| |
| CUmemAllocationProp Prop = {}; |
| size_t Granularity = 0; |
| |
| size_t Free, Total; |
| CUresult Res = cuMemGetInfo(&Free, &Total); |
| if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s")) |
| return Err; |
| |
| if (Size >= Free) { |
| *Addr = nullptr; |
| return Plugin::error( |
| "Canot map memory size larger than the available device memory"); |
| } |
| |
| // currently NVidia only supports pinned device types |
| Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; |
| Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; |
| |
| Prop.location.id = DeviceId; |
| cuMemGetAllocationGranularity(&Granularity, &Prop, |
| CU_MEM_ALLOC_GRANULARITY_MINIMUM); |
| if (auto Err = |
| Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s")) |
| return Err; |
| |
| if (Granularity == 0) |
| return Plugin::error("Wrong device Page size"); |
| |
| // Ceil to page size. |
| Size = roundUp(Size, Granularity); |
| |
| // Create a handler of our allocation |
| CUmemGenericAllocationHandle AHandle; |
| Res = cuMemCreate(&AHandle, Size, &Prop, 0); |
| if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s")) |
| return Err; |
| |
| CUdeviceptr DevPtr = 0; |
| Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0); |
| if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s")) |
| return Err; |
| |
| Res = cuMemMap(DevPtr, Size, 0, AHandle, 0); |
| if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s")) |
| return Err; |
| |
| CUmemAccessDesc ADesc = {}; |
| ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; |
| ADesc.location.id = DeviceId; |
| ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; |
| |
| // Sets address |
| Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1); |
| if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s")) |
| return Err; |
| |
| *Addr = reinterpret_cast<void *>(DevPtr); |
| *RSize = Size; |
| DeviceMMaps.insert({DevPtr, AHandle}); |
| return Plugin::success(); |
| } |
| |
| /// De-allocates device memory and Unmaps the Virtual Addr |
| Error memoryVAUnMap(void *VAddr, size_t Size) override { |
| CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr); |
| auto IHandle = DeviceMMaps.find(DVAddr); |
| // Mapping does not exist |
| if (IHandle == DeviceMMaps.end()) { |
| return Plugin::error("Addr is not MemoryMapped"); |
| } |
| |
| if (IHandle == DeviceMMaps.end()) |
| return Plugin::error("Addr is not MemoryMapped"); |
| |
| CUmemGenericAllocationHandle &AllocHandle = IHandle->second; |
| |
| CUresult Res = cuMemUnmap(DVAddr, Size); |
| if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s")) |
| return Err; |
| |
| Res = cuMemRelease(AllocHandle); |
| if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s")) |
| return Err; |
| |
| Res = cuMemAddressFree(DVAddr, Size); |
| if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s")) |
| return Err; |
| |
| DeviceMMaps.erase(IHandle); |
| return Plugin::success(); |
| } |
| |
| /// Query for the completion of the pending operations on the async info. |
| Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override { |
| CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue); |
| CUresult Res = cuStreamQuery(Stream); |
| |
| // Not ready streams must be considered as successful operations. |
| if (Res == CUDA_ERROR_NOT_READY) |
| return Plugin::success(); |
| |
| // Once the stream is synchronized and the operations completed (or an error |
| // occurs), return it to stream pool and reset AsyncInfo. This is to make |
| // sure the synchronization only works for its own tasks. |
| AsyncInfo.Queue = nullptr; |
| if (auto Err = CUDAStreamManager.returnResource(Stream)) |
| return Err; |
| |
| return Plugin::check(Res, "Error in cuStreamQuery: %s"); |
| } |
| |
/// Lock (pin) the host buffer \p HstPtr of \p Size bytes. Currently a
/// placeholder: nothing is registered and \p HstPtr is returned unchanged.
Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
  // TODO: Register the buffer as CUDA host memory.
  return HstPtr;
}
| |
/// Unlock the host buffer \p HstPtr. Currently a no-op that always succeeds.
Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
| |
/// Query whether \p HstPtr belongs to a pinned host buffer. Currently always
/// answers false and leaves the output parameters untouched.
Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
                               void *&BaseDevAccessiblePtr,
                               size_t &BaseSize) const override {
  // TODO: Implement pinning feature for CUDA.
  return false;
}
| |
| /// Submit data to the device (host to device transfer). |
| Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| if (auto Err = setContext()) |
| return Err; |
| |
| CUstream Stream; |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); |
| return Plugin::check(Res, "Error in cuMemcpyHtoDAsync: %s"); |
| } |
| |
| /// Retrieve data from the device (device to host transfer). |
| Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| if (auto Err = setContext()) |
| return Err; |
| |
| CUstream Stream; |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| // If there is already pending work on the stream it could be waiting for |
| // someone to check the RPC server. |
| if (auto RPCServer = getRPCServer()) { |
| CUresult Res = cuStreamQuery(Stream); |
| while (Res == CUDA_ERROR_NOT_READY) { |
| if (auto Err = RPCServer->runServer(*this)) |
| return Err; |
| Res = cuStreamQuery(Stream); |
| } |
| } |
| |
| CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); |
| return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s"); |
| } |
| |
/// Exchange data between two devices directly. We may use peer access if
/// the CUDA devices and driver allow them. Only declared here; the
/// definition is provided out of line.
Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                       void *DstPtr, int64_t Size,
                       AsyncInfoWrapperTy &AsyncInfoWrapper) override;
| |
| /// Initialize the async info for interoperability purposes. |
| Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| if (auto Err = setContext()) |
| return Err; |
| |
| CUstream Stream; |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| /// Initialize the device info for interoperability purposes. |
| Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { |
| assert(Context && "Context is null"); |
| assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device"); |
| |
| if (auto Err = setContext()) |
| return Err; |
| |
| if (!DeviceInfo->Context) |
| DeviceInfo->Context = Context; |
| |
| if (!DeviceInfo->Device) |
| DeviceInfo->Device = reinterpret_cast<void *>(Device); |
| |
| return Plugin::success(); |
| } |
| |
| /// Create an event. |
| Error createEventImpl(void **EventPtrStorage) override { |
| CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage); |
| return CUDAEventManager.getResource(*Event); |
| } |
| |
| /// Destroy a previously created event. |
| Error destroyEventImpl(void *EventPtr) override { |
| CUevent Event = reinterpret_cast<CUevent>(EventPtr); |
| return CUDAEventManager.returnResource(Event); |
| } |
| |
| /// Record the event. |
| Error recordEventImpl(void *EventPtr, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| CUevent Event = reinterpret_cast<CUevent>(EventPtr); |
| |
| CUstream Stream; |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| CUresult Res = cuEventRecord(Event, Stream); |
| return Plugin::check(Res, "Error in cuEventRecord: %s"); |
| } |
| |
| /// Make the stream wait on the event. |
| Error waitEventImpl(void *EventPtr, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) override { |
| CUevent Event = reinterpret_cast<CUevent>(EventPtr); |
| |
| CUstream Stream; |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| // Do not use CU_EVENT_WAIT_DEFAULT here as it is only available from |
| // specific CUDA version, and defined as 0x0. In previous version, per CUDA |
| // API document, that argument has to be 0x0. |
| CUresult Res = cuStreamWaitEvent(Stream, Event, 0); |
| return Plugin::check(Res, "Error in cuStreamWaitEvent: %s"); |
| } |
| |
| /// Synchronize the current thread with the event. |
| Error syncEventImpl(void *EventPtr) override { |
| CUevent Event = reinterpret_cast<CUevent>(EventPtr); |
| CUresult Res = cuEventSynchronize(Event); |
| return Plugin::check(Res, "Error in cuEventSynchronize: %s"); |
| } |
| |
| /// Print information about the device. |
| Error obtainInfoImpl(InfoQueueTy &Info) override { |
| char TmpChar[1000]; |
| const char *TmpCharPtr; |
| size_t TmpSt; |
| int TmpInt; |
| |
| CUresult Res = cuDriverGetVersion(&TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("CUDA Driver Version", TmpInt); |
| |
| Info.add("CUDA OpenMP Device Number", DeviceId); |
| |
| Res = cuDeviceGetName(TmpChar, 1000, Device); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Device Name", TmpChar); |
| |
| Res = cuDeviceTotalMem(&TmpSt, Device); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Global Memory Size", TmpSt, "bytes"); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Number of Multiprocessors", TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Concurrent Copy and Execution", (bool)TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Total Constant Memory", TmpInt, "bytes"); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, |
| TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Max Shared Memory per Block", TmpInt, "bytes"); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Registers per Block", TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Warp Size", TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Maximum Threads per Block", TmpInt); |
| |
| Info.add("Maximum Block Dimensions", ""); |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add<InfoLevel2>("x", TmpInt); |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add<InfoLevel2>("y", TmpInt); |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add<InfoLevel2>("z", TmpInt); |
| |
| Info.add("Maximum Grid Dimensions", ""); |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add<InfoLevel2>("x", TmpInt); |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add<InfoLevel2>("y", TmpInt); |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add<InfoLevel2>("z", TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Maximum Memory Pitch", TmpInt, "bytes"); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Texture Alignment", TmpInt, "bytes"); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Clock Rate", TmpInt, "kHz"); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Execution Timeout", (bool)TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Integrated Device", (bool)TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Can Map Host Memory", (bool)TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt); |
| if (Res == CUDA_SUCCESS) { |
| if (TmpInt == CU_COMPUTEMODE_DEFAULT) |
| TmpCharPtr = "Default"; |
| else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) |
| TmpCharPtr = "Prohibited"; |
| else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) |
| TmpCharPtr = "Exclusive process"; |
| else |
| TmpCharPtr = "Unknown"; |
| Info.add("Compute Mode", TmpCharPtr); |
| } |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Concurrent Kernels", (bool)TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("ECC Enabled", (bool)TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Memory Clock Rate", TmpInt, "kHz"); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Memory Bus Width", TmpInt, "bits"); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("L2 Cache Size", TmpInt, "bytes"); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, |
| TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Max Threads Per SMP", TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Async Engines", TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Unified Addressing", (bool)TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Managed Memory", (bool)TmpInt); |
| |
| Res = |
| getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Concurrent Managed Memory", (bool)TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, |
| TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Preemption Supported", (bool)TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Cooperative Launch", (bool)TmpInt); |
| |
| Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt); |
| if (Res == CUDA_SUCCESS) |
| Info.add("Multi-Device Boars", (bool)TmpInt); |
| |
| Info.add("Compute Capabilities", ComputeCapability.str()); |
| |
| return Plugin::success(); |
| } |
| |
| virtual bool shouldSetupDeviceMemoryPool() const override { |
| /// We use the CUDA malloc for now. |
| return false; |
| } |
| |
| /// Getters and setters for stack and heap sizes. |
| Error getDeviceStackSize(uint64_t &Value) override { |
| return getCtxLimit(CU_LIMIT_STACK_SIZE, Value); |
| } |
| Error setDeviceStackSize(uint64_t Value) override { |
| return setCtxLimit(CU_LIMIT_STACK_SIZE, Value); |
| } |
| Error getDeviceHeapSize(uint64_t &Value) override { |
| return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value); |
| } |
| Error setDeviceHeapSize(uint64_t Value) override { |
| return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value); |
| } |
| Error getDeviceMemorySize(uint64_t &Value) override { |
| CUresult Res = cuDeviceTotalMem(&Value, Device); |
| return Plugin::check(Res, "Error in getDeviceMemorySize %s"); |
| } |
| |
| /// CUDA-specific functions for getting and setting context limits. |
| Error setCtxLimit(CUlimit Kind, uint64_t Value) { |
| CUresult Res = cuCtxSetLimit(Kind, Value); |
| return Plugin::check(Res, "Error in cuCtxSetLimit: %s"); |
| } |
| Error getCtxLimit(CUlimit Kind, uint64_t &Value) { |
| CUresult Res = cuCtxGetLimit(&Value, Kind); |
| return Plugin::check(Res, "Error in cuCtxGetLimit: %s"); |
| } |
| |
| /// CUDA-specific function to get device attributes. |
| Error getDeviceAttr(uint32_t Kind, uint32_t &Value) { |
| // TODO: Warn if the new value is larger than the old. |
| CUresult Res = |
| cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device); |
| return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"); |
| } |
| |
| CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) { |
| return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device); |
| } |
| |
| /// See GenericDeviceTy::getComputeUnitKind(). |
| std::string getComputeUnitKind() const override { |
| return ComputeCapability.str(); |
| } |
| |
| /// Returns the clock frequency for the given NVPTX device. |
| uint64_t getClockFrequency() const override { return 1000000000; } |
| |
| private: |
| using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>; |
| using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>; |
| |
| Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, |
| bool IsCtor) { |
| const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini"; |
| // Perform a quick check for the named kernel in the image. The kernel |
| // should be created by the 'nvptx-lower-ctor-dtor' pass. |
| GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); |
| if (!Handler.isSymbolInImage(*this, Image, KernelName)) |
| return Plugin::success(); |
| |
| // The Nvidia backend cannot handle creating the ctor / dtor array |
| // automatically so we must create it ourselves. The backend will emit |
| // several globals that contain function pointers we can call. These are |
| // prefixed with a known name due to Nvidia's lack of section support. |
| const ELF64LEObjectFile *ELFObj = |
| Handler.getOrCreateELFObjectFile(*this, Image); |
| if (!ELFObj) |
| return Plugin::error("Unable to create ELF object for image %p", |
| Image.getStart()); |
| |
| // Search for all symbols that contain a constructor or destructor. |
| SmallVector<std::pair<StringRef, uint16_t>> Funcs; |
| for (ELFSymbolRef Sym : ELFObj->symbols()) { |
| auto NameOrErr = Sym.getName(); |
| if (!NameOrErr) |
| return NameOrErr.takeError(); |
| |
| if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_" |
| : "__fini_array_object_")) |
| continue; |
| |
| uint16_t Priority; |
| if (NameOrErr->rsplit('_').second.getAsInteger(10, Priority)) |
| return Plugin::error("Invalid priority for constructor or destructor"); |
| |
| Funcs.emplace_back(*NameOrErr, Priority); |
| } |
| |
| // Sort the created array to be in priority order. |
| llvm::sort(Funcs, [=](auto x, auto y) { return x.second < y.second; }); |
| |
| // Allocate a buffer to store all of the known constructor / destructor |
| // functions in so we can iterate them on the device. |
| void *Buffer = |
| allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE); |
| if (!Buffer) |
| return Plugin::error("Failed to allocate memory for global buffer"); |
| |
| auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer); |
| auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size(); |
| |
| SmallVector<void *> FunctionPtrs(Funcs.size()); |
| std::size_t Idx = 0; |
| for (auto [Name, Priority] : Funcs) { |
| GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]); |
| if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr)) |
| return Err; |
| } |
| |
| // Copy the local buffer to the device. |
| if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(), |
| FunctionPtrs.size() * sizeof(void *), nullptr)) |
| return Err; |
| |
| // Copy the created buffer to the appropriate symbols so the kernel can |
| // iterate through them. |
| GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start", |
| sizeof(void *), &GlobalPtrStart); |
| if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal)) |
| return Err; |
| |
| GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end", |
| sizeof(void *), &GlobalPtrStop); |
| if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal)) |
| return Err; |
| |
| CUDAKernelTy CUDAKernel(KernelName); |
| |
| if (auto Err = CUDAKernel.init(*this, Image)) |
| return Err; |
| |
| AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); |
| |
| KernelArgsTy KernelArgs = {}; |
| if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u, |
| /*NumBlocks=*/1ul, KernelArgs, nullptr, |
| AsyncInfoWrapper)) |
| return Err; |
| |
| Error Err = Plugin::success(); |
| AsyncInfoWrapper.finalize(Err); |
| |
| if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS) |
| return Plugin::error("Failed to free memory for global buffer"); |
| |
| return Err; |
| } |
| |
| /// Stream manager for CUDA streams. |
| CUDAStreamManagerTy CUDAStreamManager; |
| |
| /// Event manager for CUDA events. |
| CUDAEventManagerTy CUDAEventManager; |
| |
| /// The device's context. This context should be set before performing |
| /// operations on the device. |
| CUcontext Context = nullptr; |
| |
| /// The CUDA device handler. |
| CUdevice Device = CU_DEVICE_INVALID; |
| |
| /// The memory mapped addresses and their handles |
| std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps; |
| |
/// The compute capability of the corresponding CUDA device, kept as separate
/// major and minor numbers.
struct ComputeCapabilityTy {
  uint32_t Major;
  uint32_t Minor;

  /// Render the capability as "sm_" followed by the decimal value of
  /// Major * 10 + Minor.
  std::string str() const {
    const uint32_t Combined = Major * 10 + Minor;
    std::string Name("sm_");
    Name += std::to_string(Combined);
    return Name;
  }
} ComputeCapability;
| |
| /// The maximum number of warps that can be resident on all the SMs |
| /// simultaneously. |
| uint32_t HardwareParallelism = 0; |
| }; |
| |
| Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, |
| uint32_t NumThreads, uint64_t NumBlocks, |
| KernelArgsTy &KernelArgs, void *Args, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) const { |
| CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice); |
| |
| CUstream Stream; |
| if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| uint32_t MaxDynCGroupMem = |
| std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize()); |
| |
| CUresult Res = |
| cuLaunchKernel(Func, NumBlocks, /* gridDimY */ 1, |
| /* gridDimZ */ 1, NumThreads, |
| /* blockDimY */ 1, /* blockDimZ */ 1, MaxDynCGroupMem, |
| Stream, (void **)Args, nullptr); |
| return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName()); |
| } |
| |
| /// Class implementing the CUDA-specific functionalities of the global handler. |
| class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy { |
| public: |
| /// Get the metadata of a global from the device. The name and size of the |
| /// global is read from DeviceGlobal and the address of the global is written |
| /// to DeviceGlobal. |
| Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, |
| DeviceImageTy &Image, |
| GlobalTy &DeviceGlobal) override { |
| CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image); |
| |
| const char *GlobalName = DeviceGlobal.getName().data(); |
| |
| size_t CUSize; |
| CUdeviceptr CUPtr; |
| CUresult Res = |
| cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName); |
| if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s", |
| GlobalName)) |
| return Err; |
| |
| if (CUSize != DeviceGlobal.getSize()) |
| return Plugin::error( |
| "Failed to load global '%s' due to size mismatch (%zu != %zu)", |
| GlobalName, CUSize, (size_t)DeviceGlobal.getSize()); |
| |
| DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr)); |
| return Plugin::success(); |
| } |
| }; |
| |
| /// Class implementing the CUDA-specific functionalities of the plugin. |
| struct CUDAPluginTy final : public GenericPluginTy { |
| /// Create a CUDA plugin. |
| CUDAPluginTy() : GenericPluginTy(getTripleArch()) {} |
| |
| /// This class should not be copied. |
| CUDAPluginTy(const CUDAPluginTy &) = delete; |
| CUDAPluginTy(CUDAPluginTy &&) = delete; |
| |
| /// Initialize the plugin and return the number of devices. |
| Expected<int32_t> initImpl() override { |
| CUresult Res = cuInit(0); |
| if (Res == CUDA_ERROR_INVALID_HANDLE) { |
| // Cannot call cuGetErrorString if dlsym failed. |
| DP("Failed to load CUDA shared library\n"); |
| return 0; |
| } |
| |
| #ifdef OMPT_SUPPORT |
| ompt::connectLibrary(); |
| #endif |
| |
| if (Res == CUDA_ERROR_NO_DEVICE) { |
| // Do not initialize if there are no devices. |
| DP("There are no devices supporting CUDA.\n"); |
| return 0; |
| } |
| |
| if (auto Err = Plugin::check(Res, "Error in cuInit: %s")) |
| return std::move(Err); |
| |
| // Get the number of devices. |
| int NumDevices; |
| Res = cuDeviceGetCount(&NumDevices); |
| if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s")) |
| return std::move(Err); |
| |
| // Do not initialize if there are no devices. |
| if (NumDevices == 0) |
| DP("There are no devices supporting CUDA.\n"); |
| |
| return NumDevices; |
| } |
| |
| /// Deinitialize the plugin. |
| Error deinitImpl() override { return Plugin::success(); } |
| |
| /// Get the ELF code for recognizing the compatible image binary. |
| uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; } |
| |
| Triple::ArchType getTripleArch() const override { |
| // TODO: I think we can drop the support for 32-bit NVPTX devices. |
| return Triple::nvptx64; |
| } |
| |
| /// Check whether the image is compatible with the available CUDA devices. |
| Expected<bool> isImageCompatible(__tgt_image_info *Info) const override { |
| for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) { |
| CUdevice Device; |
| CUresult Res = cuDeviceGet(&Device, DevId); |
| if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s")) |
| return std::move(Err); |
| |
| int32_t Major, Minor; |
| Res = cuDeviceGetAttribute( |
| &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device); |
| if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s")) |
| return std::move(Err); |
| |
| Res = cuDeviceGetAttribute( |
| &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device); |
| if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s")) |
| return std::move(Err); |
| |
| StringRef ArchStr(Info->Arch); |
| StringRef PrefixStr("sm_"); |
| if (!ArchStr.starts_with(PrefixStr)) |
| return Plugin::error("Unrecognized image arch %s", ArchStr.data()); |
| |
| int32_t ImageMajor = ArchStr[PrefixStr.size() + 0] - '0'; |
| int32_t ImageMinor = ArchStr[PrefixStr.size() + 1] - '0'; |
| |
| // A cubin generated for a certain compute capability is supported to run |
| // on any GPU with the same major revision and same or higher minor |
| // revision. |
| if (Major != ImageMajor || Minor < ImageMinor) |
| return false; |
| } |
| return true; |
| } |
| }; |
| |
| Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr, |
| GenericDeviceTy &DstGenericDevice, |
| void *DstPtr, int64_t Size, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) { |
| if (auto Err = setContext()) |
| return Err; |
| |
| CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice); |
| |
| CUresult Res; |
| int32_t DstDeviceId = DstDevice.DeviceId; |
| CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr; |
| CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr; |
| |
| int CanAccessPeer = 0; |
| if (DeviceId != DstDeviceId) { |
| // Make sure the lock is released before performing the copies. |
| std::lock_guard<std::mutex> Lock(PeerAccessesLock); |
| |
| switch (PeerAccesses[DstDeviceId]) { |
| case PeerAccessState::AVAILABLE: |
| CanAccessPeer = 1; |
| break; |
| case PeerAccessState::UNAVAILABLE: |
| CanAccessPeer = 0; |
| break; |
| case PeerAccessState::PENDING: |
| // Check whether the source device can access the destination device. |
| Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device); |
| if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s")) |
| return Err; |
| |
| if (CanAccessPeer) { |
| Res = cuCtxEnablePeerAccess(DstDevice.Context, 0); |
| if (Res == CUDA_ERROR_TOO_MANY_PEERS) { |
| // Resources may be exhausted due to many P2P links. |
| CanAccessPeer = 0; |
| DP("Too many P2P so fall back to D2D memcpy"); |
| } else if (auto Err = |
| Plugin::check(Res, "Error in cuCtxEnablePeerAccess: %s")) |
| return Err; |
| } |
| PeerAccesses[DstDeviceId] = (CanAccessPeer) |
| ? PeerAccessState::AVAILABLE |
| : PeerAccessState::UNAVAILABLE; |
| } |
| } |
| |
| CUstream Stream; |
| if (auto Err = getStream(AsyncInfoWrapper, Stream)) |
| return Err; |
| |
| if (CanAccessPeer) { |
| // TODO: Should we fallback to D2D if peer access fails? |
| Res = cuMemcpyPeerAsync(CUDstPtr, Context, CUSrcPtr, DstDevice.Context, |
| Size, Stream); |
| return Plugin::check(Res, "Error in cuMemcpyPeerAsync: %s"); |
| } |
| |
| // Fallback to D2D copy. |
| Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream); |
| return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s"); |
| } |
| |
| GenericPluginTy *Plugin::createPlugin() { return new CUDAPluginTy(); } |
| |
| GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) { |
| return new CUDADeviceTy(DeviceId, NumDevices); |
| } |
| |
| GenericGlobalHandlerTy *Plugin::createGlobalHandler() { |
| return new CUDAGlobalHandlerTy(); |
| } |
| |
| template <typename... ArgsTy> |
| Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { |
| CUresult ResultCode = static_cast<CUresult>(Code); |
| if (ResultCode == CUDA_SUCCESS) |
| return Error::success(); |
| |
| const char *Desc = "Unknown error"; |
| CUresult Ret = cuGetErrorString(ResultCode, &Desc); |
| if (Ret != CUDA_SUCCESS) |
| REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code); |
| |
| return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(), |
| ErrFmt, Args..., Desc); |
| } |
| |
| } // namespace plugin |
| } // namespace target |
| } // namespace omp |
| } // namespace llvm |