//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//
#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "Debug.h"
#include "omptargetplugin.h"
#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
#include "MemoryManager.h"
// Utility for retrieving and printing the CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err) \
do { \
if (getDebugLevel() > 0) { \
const char *errStr = nullptr; \
CUresult errStr_status = cuGetErrorString(err, &errStr); \
if (errStr_status == CUDA_ERROR_INVALID_VALUE) \
REPORT("Unrecognized CUDA error code: %d\n", err); \
else if (errStr_status == CUDA_SUCCESS) \
REPORT("CUDA error is: %s\n", errStr); \
else { \
REPORT("Unresolved CUDA error code: %d\n", err); \
REPORT("Unsuccessful cuGetErrorString return status: %d\n", \
errStr_status); \
} \
} else { \
const char *errStr = nullptr; \
CUresult errStr_status = cuGetErrorString(err, &errStr); \
if (errStr_status == CUDA_SUCCESS) \
REPORT("%s \n", errStr); \
} \
} while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err) \
do { \
const char *errStr = nullptr; \
CUresult errStr_status = cuGetErrorString(err, &errStr); \
if (errStr_status == CUDA_SUCCESS) \
REPORT("%s \n", errStr); \
} while (false)
#endif // OMPTARGET_DEBUG
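// A minimal sketch of the call-site pattern used throughout this file when a
// name is needed in the message (so checkResult() below is not applicable);
// the identifiers in this sketch are illustrative only:
//
//   CUresult Err = cuModuleGetFunction(&Func, Module, Name);
//   if (Err != CUDA_SUCCESS) {
//     REPORT("Loading '%s' Failed\n", Name);
//     CUDA_ERR_STRING(Err);
//     return nullptr;
//   }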
#include "elf_common.h"
/// Keep entries table per device.
struct FuncOrGblEntryTy {
__tgt_target_table Table;
std::vector<__tgt_offload_entry> Entries;
};
enum ExecutionModeType {
SPMD, // constructors, destructors,
// combined constructs (`teams distribute parallel for [simd]`)
GENERIC, // everything else
NONE
};
/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
CUfunction Func;
// execution mode of kernel
// 0 - SPMD mode (without master warp)
// 1 - Generic mode (with master warp)
int8_t ExecutionMode;
/// Maximal number of threads per block for this kernel.
int MaxThreadsPerBlock = 0;
KernelTy(CUfunction _Func, int8_t _ExecutionMode)
: Func(_Func), ExecutionMode(_ExecutionMode) {}
};
/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
int32_t debug_level;
};
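// Illustrative sketch (an assumption about the deviceRTL, not something this
// plugin verifies): the device runtime is expected to define a matching
// global, e.g.
//
//   __device__ omptarget_device_environmentTy omptarget_device_environment;
//
// which loadBinary() below looks up by the name
// "omptarget_device_environment" and initializes with cuMemcpyHtoD.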
namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
if (Err == CUDA_SUCCESS)
return true;
REPORT("%s", ErrMsg);
CUDA_ERR_STRING(Err);
return false;
}
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
CUstream Stream) {
CUresult Err =
cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
if (Err != CUDA_SUCCESS) {
DP("Error when copying data from device to device. Pointers: src "
"= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
CUDA_ERR_STRING(Err);
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
// Structure containing per-device data
struct DeviceDataTy {
/// List that contains all the kernels.
std::list<KernelTy> KernelsList;
std::list<FuncOrGblEntryTy> FuncGblEntries;
CUcontext Context = nullptr;
// Device properties
int ThreadsPerBlock = 0;
int BlocksPerGrid = 0;
int WarpSize = 0;
// OpenMP properties
int NumTeams = 0;
int NumThreads = 0;
};
class StreamManagerTy {
int NumberOfDevices;
// The initial size of stream pool
int EnvNumInitialStreams;
// Per-device stream mutex
std::vector<std::unique_ptr<std::mutex>> StreamMtx;
// Per-device stream Id indicates the next available stream in the pool
std::vector<int> NextStreamId;
// Per-device stream pool
std::vector<std::vector<CUstream>> StreamPool;
// Reference to per-device data
std::vector<DeviceDataTy> &DeviceData;
  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstreams. This function should be called with the device
  // mutex held, and it never shrinks the pool.
void resizeStreamPool(const int DeviceId, const size_t NewSize) {
std::vector<CUstream> &Pool = StreamPool[DeviceId];
const size_t CurrentSize = Pool.size();
assert(NewSize > CurrentSize && "new size is not larger than current size");
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, to avoid
      // creating a bunch of streams that do not belong to the right device.
      // Offloading will fail later because the selected CUstream is nullptr.
return;
}
Pool.resize(NewSize, nullptr);
for (size_t I = CurrentSize; I < NewSize; ++I) {
checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
"Error returned from cuStreamCreate\n");
}
}
public:
StreamManagerTy(const int NumberOfDevices,
std::vector<DeviceDataTy> &DeviceData)
: NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
DeviceData(DeviceData) {
StreamPool.resize(NumberOfDevices);
NextStreamId.resize(NumberOfDevices);
StreamMtx.resize(NumberOfDevices);
if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
EnvNumInitialStreams = std::stoi(EnvStr);
// Initialize the next stream id
std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
// Initialize stream mutex
for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
Ptr = std::make_unique<std::mutex>();
}
~StreamManagerTy() {
// Destroy streams
for (int I = 0; I < NumberOfDevices; ++I) {
checkResult(cuCtxSetCurrent(DeviceData[I].Context),
"Error returned from cuCtxSetCurrent\n");
for (CUstream &S : StreamPool[I]) {
if (S)
checkResult(cuStreamDestroy(S),
"Error returned from cuStreamDestroy\n");
}
}
}
  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That means CUstreams [0, id-1] have been
  // assigned, and [id,] are still available. If there is no CUstream left, we
  // will ask CUDA RT for more. Each time a CUstream is assigned, the id
  // increases by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
CUstream getStream(const int DeviceId) {
const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
int &Id = NextStreamId[DeviceId];
// No CUstream left in the pool, we need to request from CUDA RT
if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
// By default we double the stream pool every time
resizeStreamPool(DeviceId, Id * 2);
}
return StreamPool[DeviceId][Id++];
}
  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we need to first decrease the id, and then copy the
  // CUstream back.
  // It is worth noting that the order in which streams are returned might
  // differ from the order in which they were assigned, which means that at
  // some point there might be two identical CUstreams.
  // xxax+a+++++
  //      ^
  //      id
  // However, this does not matter because they are always on the two sides of
  // id. The one on the left will eventually be overwritten by another
  // CUstream. Therefore, after several executions, the order of the pool
  // might differ from its initial state.
void returnStream(const int DeviceId, CUstream Stream) {
const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
int &Id = NextStreamId[DeviceId];
assert(Id > 0 && "Wrong stream ID");
StreamPool[DeviceId][--Id] = Stream;
}
bool initializeDeviceStreamPool(const int DeviceId) {
assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
resizeStreamPool(DeviceId, EnvNumInitialStreams);
// Check the size of stream pool
if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
return false;
// Check whether each stream is valid
for (CUstream &S : StreamPool[DeviceId])
if (!S)
return false;
return true;
}
};
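// A minimal usage sketch of the stream manager, mirroring how this plugin
// pairs getStream()/returnStream() via __tgt_async_info in getStream() and
// synchronize() below (identifiers are illustrative):
//
//   CUstream Stream = StreamManager->getStream(DeviceId);
//   CUresult Err = cuMemcpyHtoDAsync(DevPtr, HstPtr, Size, Stream);
//   // ... more asynchronous work enqueued on the same stream ...
//   Err = cuStreamSynchronize(Stream);
//   StreamManager->returnStream(DeviceId, Stream);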
class DeviceRTLTy {
int NumberOfDevices;
// OpenMP environment properties
int EnvNumTeams;
int EnvTeamLimit;
// OpenMP requires flags
int64_t RequiresFlags;
static constexpr const int HardTeamLimit = 1U << 16U; // 64k
static constexpr const int HardThreadLimit = 1024;
static constexpr const int DefaultNumTeams = 128;
static constexpr const int DefaultNumThreads = 128;
std::unique_ptr<StreamManagerTy> StreamManager;
std::vector<DeviceDataTy> DeviceData;
std::vector<CUmodule> Modules;
/// A class responsible for interacting with device native runtime library to
/// allocate and free memory.
class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
const int DeviceId;
const std::vector<DeviceDataTy> &DeviceData;
public:
CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
: DeviceId(DeviceId), DeviceData(DeviceData) {}
void *allocate(size_t Size, void *) override {
if (Size == 0)
return nullptr;
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return nullptr;
CUdeviceptr DevicePtr;
Err = cuMemAlloc(&DevicePtr, Size);
if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
return nullptr;
return (void *)DevicePtr;
}
int free(void *TgtPtr) override {
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return OFFLOAD_FAIL;
Err = cuMemFree((CUdeviceptr)TgtPtr);
if (!checkResult(Err, "Error returned from cuMemFree\n"))
return OFFLOAD_FAIL;
return OFFLOAD_SUCCESS;
}
};
/// A vector of device allocators
std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
  /// A vector of memory managers. Since the memory manager is non-copyable
  /// and non-movable, we wrap each one in a std::unique_ptr.
std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
  /// Whether to use the memory manager
bool UseMemoryManager = true;
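  /// Note (an assumption based on MemoryManager.h, not verified here): when
  /// the memory manager is enabled, dataAlloc()/dataDelete() below route every
  /// request through it; the manager pools small allocations up to the
  /// threshold returned by getSizeThresholdFromEnv() and falls back to
  /// CUDADeviceAllocatorTy (plain cuMemAlloc/cuMemFree) for larger ones.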
// Record entry point associated with device
void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
E.Entries.push_back(entry);
}
// Return a pointer to the entry associated with the pointer
const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
const void *Addr) const {
for (const __tgt_offload_entry &Itr :
DeviceData[DeviceId].FuncGblEntries.back().Entries)
if (Itr.addr == Addr)
return &Itr;
return nullptr;
}
// Return the pointer to the target entries table
__tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
if (E.Entries.empty())
return nullptr;
// Update table info according to the entries and return the pointer
E.Table.EntriesBegin = E.Entries.data();
E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
return &E.Table;
}
// Clear entries table for a device
void clearOffloadEntriesTable(const int DeviceId) {
DeviceData[DeviceId].FuncGblEntries.emplace_back();
FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
E.Entries.clear();
E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
}
CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
assert(AsyncInfo && "AsyncInfo is nullptr");
if (!AsyncInfo->Queue)
AsyncInfo->Queue = StreamManager->getStream(DeviceId);
return reinterpret_cast<CUstream>(AsyncInfo->Queue);
}
public:
// This class should not be copied
DeviceRTLTy(const DeviceRTLTy &) = delete;
DeviceRTLTy(DeviceRTLTy &&) = delete;
DeviceRTLTy()
: NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
RequiresFlags(OMP_REQ_UNDEFINED) {
DP("Start initializing CUDA\n");
CUresult Err = cuInit(0);
if (Err == CUDA_ERROR_INVALID_HANDLE) {
// Can't call cuGetErrorString if dlsym failed
DP("Failed to load CUDA shared library\n");
return;
}
if (!checkResult(Err, "Error returned from cuInit\n")) {
return;
}
Err = cuDeviceGetCount(&NumberOfDevices);
if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
return;
if (NumberOfDevices == 0) {
DP("There are no devices supporting CUDA.\n");
return;
}
DeviceData.resize(NumberOfDevices);
// Get environment variables regarding teams
if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
// OMP_TEAM_LIMIT has been set
EnvTeamLimit = std::stoi(EnvStr);
DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
}
if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
// OMP_NUM_TEAMS has been set
EnvNumTeams = std::stoi(EnvStr);
DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
}
StreamManager =
std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
for (int I = 0; I < NumberOfDevices; ++I)
DeviceAllocators.emplace_back(I, DeviceData);
// Get the size threshold from environment variable
std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
UseMemoryManager = Res.second;
size_t MemoryManagerThreshold = Res.first;
if (UseMemoryManager)
for (int I = 0; I < NumberOfDevices; ++I)
MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
DeviceAllocators[I], MemoryManagerThreshold));
}
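  // Illustrative example of the environment variables read above (and in
  // StreamManagerTy), all parsed with std::stoi; the values and program name
  // are arbitrary:
  //
  //   OMP_NUM_TEAMS=256 OMP_TEAM_LIMIT=1024 \
  //   LIBOMPTARGET_NUM_INITIAL_STREAMS=64 ./offloading_app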
~DeviceRTLTy() {
    // Destruct the memory managers first, so that the data they depend on is
    // not destroyed before them.
    for (auto &M : MemoryManagers)
      M.reset();
StreamManager = nullptr;
for (CUmodule &M : Modules)
// Close module
if (M)
checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
for (DeviceDataTy &D : DeviceData) {
// Destroy context
if (D.Context) {
checkResult(cuCtxSetCurrent(D.Context),
"Error returned from cuCtxSetCurrent\n");
CUdevice Device;
checkResult(cuCtxGetDevice(&Device),
"Error returned from cuCtxGetDevice\n");
checkResult(cuDevicePrimaryCtxRelease(Device),
"Error returned from cuDevicePrimaryCtxRelease\n");
}
}
}
// Check whether a given DeviceId is valid
bool isValidDeviceId(const int DeviceId) const {
return DeviceId >= 0 && DeviceId < NumberOfDevices;
}
int getNumOfDevices() const { return NumberOfDevices; }
void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
int initDevice(const int DeviceId) {
CUdevice Device;
DP("Getting device %d\n", DeviceId);
CUresult Err = cuDeviceGet(&Device, DeviceId);
if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
return OFFLOAD_FAIL;
// Query the current flags of the primary context and set its flags if
// it is inactive
unsigned int FormerPrimaryCtxFlags = 0;
int FormerPrimaryCtxIsActive = 0;
Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
&FormerPrimaryCtxIsActive);
if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
return OFFLOAD_FAIL;
if (FormerPrimaryCtxIsActive) {
DP("The primary context is active, no change to its flags\n");
if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
CU_CTX_SCHED_BLOCKING_SYNC)
DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
} else {
DP("The primary context is inactive, set its flags to "
"CU_CTX_SCHED_BLOCKING_SYNC\n");
Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
return OFFLOAD_FAIL;
}
// Retain the per device primary context and save it to use whenever this
// device is selected.
Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
return OFFLOAD_FAIL;
Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return OFFLOAD_FAIL;
// Initialize stream pool
if (!StreamManager->initializeDeviceStreamPool(DeviceId))
return OFFLOAD_FAIL;
// Query attributes to determine number of threads/block and blocks/grid.
int MaxGridDimX;
Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
Device);
if (Err != CUDA_SUCCESS) {
DP("Error getting max grid dimension, use default value %d\n",
DeviceRTLTy::DefaultNumTeams);
DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
} else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
} else {
DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
"at the hard limit\n",
MaxGridDimX, DeviceRTLTy::HardTeamLimit);
DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
}
// We are only exploiting threads along the x axis.
int MaxBlockDimX;
Err = cuDeviceGetAttribute(&MaxBlockDimX,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
if (Err != CUDA_SUCCESS) {
DP("Error getting max block dimension, use default value %d\n",
DeviceRTLTy::DefaultNumThreads);
DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
} else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
DP("Using %d CUDA threads per block\n", MaxBlockDimX);
DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
} else {
DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
"capping at the hard limit\n",
MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
}
// Get and set warp size
int WarpSize;
Err =
cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
if (Err != CUDA_SUCCESS) {
DP("Error getting warp size, assume default value 32\n");
DeviceData[DeviceId].WarpSize = 32;
} else {
DP("Using warp size %d\n", WarpSize);
DeviceData[DeviceId].WarpSize = WarpSize;
}
// Adjust teams to the env variables
if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
EnvTeamLimit);
DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
}
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Device supports up to %d CUDA blocks and %d threads with a "
"warp size of %d\n",
DeviceData[DeviceId].BlocksPerGrid,
DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
// Set default number of teams
if (EnvNumTeams > 0) {
DP("Default number of teams set according to environment %d\n",
EnvNumTeams);
DeviceData[DeviceId].NumTeams = EnvNumTeams;
} else {
DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
DP("Default number of teams set according to library's default %d\n",
DeviceRTLTy::DefaultNumTeams);
}
if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
DP("Default number of teams exceeds device limit, capping at %d\n",
DeviceData[DeviceId].BlocksPerGrid);
DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
}
// Set default number of threads
DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
DP("Default number of threads set according to library's default %d\n",
DeviceRTLTy::DefaultNumThreads);
if (DeviceData[DeviceId].NumThreads >
DeviceData[DeviceId].ThreadsPerBlock) {
DP("Default number of threads exceeds device limit, capping at %d\n",
DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
}
return OFFLOAD_SUCCESS;
}
__tgt_target_table *loadBinary(const int DeviceId,
const __tgt_device_image *Image) {
// Set the context we are using
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return nullptr;
// Clear the offload table as we are going to create a new one.
clearOffloadEntriesTable(DeviceId);
// Create the module and extract the function pointers.
CUmodule Module;
DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
return nullptr;
DP("CUDA module successfully loaded!\n");
Modules.push_back(Module);
// Find the symbols in the module by name.
const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
if (!E->addr) {
        // We return nullptr when something like this happens: the host should
        // always have something in the address field to uniquely identify the
        // target region.
DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
return nullptr;
}
if (E->size) {
__tgt_offload_entry Entry = *E;
CUdeviceptr CUPtr;
size_t CUSize;
Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // Errors are handled inline here (instead of via checkResult) because
        // we need the entry name for the message.
if (Err != CUDA_SUCCESS) {
REPORT("Loading global '%s' Failed\n", E->name);
CUDA_ERR_STRING(Err);
return nullptr;
}
if (CUSize != E->size) {
DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
CUSize, E->size);
return nullptr;
}
DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
Entry.addr = (void *)(CUPtr);
// Note: In the current implementation declare target variables
// can either be link or to. This means that once unified
// memory is activated via the requires directive, the variable
// can be used directly from the host in both cases.
// TODO: when variables types other than to or link are added,
// the below condition should be changed to explicitly
// check for to and link variables types:
// (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
// OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
// If unified memory is present any target link or to variables
// can access host addresses directly. There is no longer a
// need for device copies.
cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
DP("Copy linked variable host address (" DPxMOD
") to device address (" DPxMOD ")\n",
DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
}
addOffloadEntry(DeviceId, Entry);
continue;
}
CUfunction Func;
Err = cuModuleGetFunction(&Func, Module, E->name);
      // Errors are handled inline here (instead of via checkResult) because
      // we need the entry name for the message.
if (Err != CUDA_SUCCESS) {
REPORT("Loading '%s' Failed\n", E->name);
CUDA_ERR_STRING(Err);
return nullptr;
}
DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
// default value GENERIC (in case symbol is missing from cubin file)
int8_t ExecModeVal = ExecutionModeType::GENERIC;
std::string ExecModeNameStr(E->name);
ExecModeNameStr += "_exec_mode";
const char *ExecModeName = ExecModeNameStr.c_str();
CUdeviceptr ExecModePtr;
size_t CUSize;
Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
if (Err == CUDA_SUCCESS) {
if (CUSize != sizeof(int8_t)) {
DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
ExecModeName, CUSize, sizeof(int8_t));
return nullptr;
}
Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
if (Err != CUDA_SUCCESS) {
REPORT("Error when copying data from device to host. Pointers: "
"host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
CUDA_ERR_STRING(Err);
return nullptr;
}
if (ExecModeVal < 0 || ExecModeVal > 1) {
DP("Error wrong exec_mode value specified in cubin file: %d\n",
ExecModeVal);
return nullptr;
}
} else {
REPORT("Loading global exec_mode '%s' - symbol missing, using default "
"value GENERIC (1)\n",
ExecModeName);
CUDA_ERR_STRING(Err);
}
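      // Illustrative (the kernel name is hypothetical): for an entry named
      // "__omp_offloading_xyz_main_l10", the device image is expected to carry
      // a companion int8_t global "__omp_offloading_xyz_main_l10_exec_mode"
      // holding 0 (SPMD) or 1 (GENERIC); beyond the size and range checks
      // above, this layout is assumed rather than verified.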
KernelsList.emplace_back(Func, ExecModeVal);
__tgt_offload_entry Entry = *E;
Entry.addr = &KernelsList.back();
addOffloadEntry(DeviceId, Entry);
}
// send device environment data to the device
{
omptarget_device_environmentTy DeviceEnv{0};
#ifdef OMPTARGET_DEBUG
if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
DeviceEnv.debug_level = std::stoi(EnvStr);
#endif
const char *DeviceEnvName = "omptarget_device_environment";
CUdeviceptr DeviceEnvPtr;
size_t CUSize;
Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
if (Err == CUDA_SUCCESS) {
if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
return nullptr;
}
Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
if (Err != CUDA_SUCCESS) {
REPORT("Error when copying data from host to device. Pointers: "
"host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
CUDA_ERR_STRING(Err);
return nullptr;
}
DP("Sending global device environment data %zu bytes\n", CUSize);
} else {
DP("Finding global device environment '%s' - symbol missing.\n",
DeviceEnvName);
DP("Continue, considering this is a device RTL which does not accept "
"environment setting.\n");
}
}
return getOffloadEntriesTable(DeviceId);
}
void *dataAlloc(const int DeviceId, const int64_t Size) {
if (UseMemoryManager)
return MemoryManagers[DeviceId]->allocate(Size, nullptr);
return DeviceAllocators[DeviceId].allocate(Size, nullptr);
}
int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
const int64_t Size, __tgt_async_info *AsyncInfo) const {
assert(AsyncInfo && "AsyncInfo is nullptr");
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return OFFLOAD_FAIL;
CUstream Stream = getStream(DeviceId, AsyncInfo);
Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
if (Err != CUDA_SUCCESS) {
DP("Error when copying data from host to device. Pointers: host "
"= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
CUDA_ERR_STRING(Err);
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
const int64_t Size, __tgt_async_info *AsyncInfo) const {
assert(AsyncInfo && "AsyncInfo is nullptr");
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return OFFLOAD_FAIL;
CUstream Stream = getStream(DeviceId, AsyncInfo);
Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
if (Err != CUDA_SUCCESS) {
DP("Error when copying data from device to host. Pointers: host "
"= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
CUDA_ERR_STRING(Err);
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
int64_t Size, __tgt_async_info *AsyncInfo) const {
assert(AsyncInfo && "AsyncInfo is nullptr");
CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return OFFLOAD_FAIL;
CUstream Stream = getStream(SrcDevId, AsyncInfo);
    // If the two devices are different, we try a peer-to-peer copy first
if (SrcDevId != DstDevId) {
int CanAccessPeer = 0;
Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
if (Err != CUDA_SUCCESS) {
REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
", dst = %" PRId32 "\n",
SrcDevId, DstDevId);
CUDA_ERR_STRING(Err);
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}
if (!CanAccessPeer) {
DP("P2P memcpy not supported so fall back to D2D memcpy");
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}
Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
if (Err != CUDA_SUCCESS) {
REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
", dst = %" PRId32 "\n",
SrcDevId, DstDevId);
CUDA_ERR_STRING(Err);
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}
Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
(CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
Size, Stream);
if (Err == CUDA_SUCCESS)
return OFFLOAD_SUCCESS;
DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
CUDA_ERR_STRING(Err);
}
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}
int dataDelete(const int DeviceId, void *TgtPtr) {
if (UseMemoryManager)
return MemoryManagers[DeviceId]->free(TgtPtr);
return DeviceAllocators[DeviceId].free(TgtPtr);
}
int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
ptrdiff_t *TgtOffsets, const int ArgNum,
const int TeamNum, const int ThreadLimit,
const unsigned int LoopTripCount,
__tgt_async_info *AsyncInfo) const {
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return OFFLOAD_FAIL;
// All args are references.
std::vector<void *> Args(ArgNum);
std::vector<void *> Ptrs(ArgNum);
for (int I = 0; I < ArgNum; ++I) {
Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
Args[I] = &Ptrs[I];
}
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
int CudaThreadsPerBlock;
if (ThreadLimit > 0) {
DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
CudaThreadsPerBlock = ThreadLimit;
// Add master warp if necessary
if (KernelInfo->ExecutionMode == GENERIC) {
DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
}
} else {
DP("Setting CUDA threads per block to default %d\n",
DeviceData[DeviceId].NumThreads);
CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
}
if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
DP("Threads per block capped at device limit %d\n",
DeviceData[DeviceId].ThreadsPerBlock);
CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
}
if (!KernelInfo->MaxThreadsPerBlock) {
Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
KernelInfo->Func);
if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
return OFFLOAD_FAIL;
}
if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
DP("Threads per block capped at kernel limit %d\n",
KernelInfo->MaxThreadsPerBlock);
CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
}
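    // Illustrative arithmetic under the rules above: with ThreadLimit = 96 in
    // GENERIC mode and a warp size of 32, CudaThreadsPerBlock starts at
    // 96 + 32 = 128 and is then clamped by the device's ThreadsPerBlock and by
    // the kernel's CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, whichever is
    // smaller.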
unsigned int CudaBlocksPerGrid;
if (TeamNum <= 0) {
if (LoopTripCount > 0 && EnvNumTeams < 0) {
if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // will execute one iteration of the loop; round up to the nearest
          // integer.
CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
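          // For example (illustrative values): LoopTripCount = 1000 and
          // CudaThreadsPerBlock = 128 give ((1000 - 1) / 128) + 1 = 8 blocks.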
} else {
// If we reach this point, then we have a non-combined construct, i.e.
// `teams distribute` with a nested `parallel for` and each team is
// assigned one iteration of the `distribute` loop. E.g.:
//
// #pragma omp target teams distribute
// for(...loop_tripcount...) {
// #pragma omp parallel for
// for(...) {}
// }
//
// Threads within a team will execute the iterations of the `parallel`
// loop.
CudaBlocksPerGrid = LoopTripCount;
}
DP("Using %d teams due to loop trip count %" PRIu32
" and number of threads per block %d\n",
CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
} else {
DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
}
} else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
DP("Capping number of teams to team limit %d\n",
DeviceData[DeviceId].BlocksPerGrid);
CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
} else {
DP("Using requested number of teams %d\n", TeamNum);
CudaBlocksPerGrid = TeamNum;
}
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Launching kernel %s with %d blocks and %d threads in %s "
"mode\n",
(getOffloadEntry(DeviceId, TgtEntryPtr))
? getOffloadEntry(DeviceId, TgtEntryPtr)->name
: "(null)",
CudaBlocksPerGrid, CudaThreadsPerBlock,
(KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");
CUstream Stream = getStream(DeviceId, AsyncInfo);
Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
/* gridDimZ */ 1, CudaThreadsPerBlock,
/* blockDimY */ 1, /* blockDimZ */ 1,
/* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
return OFFLOAD_FAIL;
DP("Launch of entry point at " DPxMOD " successful!\n",
DPxPTR(TgtEntryPtr));
return OFFLOAD_SUCCESS;
}
int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
CUresult Err = cuStreamSynchronize(Stream);
    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only applies to its
    // own tasks.
StreamManager->returnStream(DeviceId,
reinterpret_cast<CUstream>(AsyncInfo->Queue));
AsyncInfo->Queue = nullptr;
if (Err != CUDA_SUCCESS) {
DP("Error when synchronizing stream. stream = " DPxMOD
", async info ptr = " DPxMOD "\n",
DPxPTR(Stream), DPxPTR(AsyncInfo));
CUDA_ERR_STRING(Err);
}
return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
}
};
DeviceRTLTy DeviceRTL;
} // namespace
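// A hedged sketch of the typical call sequence libomptarget drives through
// this interface (the exact ordering is up to the host runtime, not this
// plugin):
//   __tgt_rtl_init_device -> __tgt_rtl_load_binary -> __tgt_rtl_data_alloc ->
//   __tgt_rtl_data_submit -> __tgt_rtl_run_target_team_region ->
//   __tgt_rtl_data_retrieve -> __tgt_rtl_data_delete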
// Exposed library API function
#ifdef __cplusplus
extern "C" {
#endif
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
return elf_check_machine(image, /* EM_CUDA */ 190);
}
int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
DeviceRTL.setRequiresFlag(RequiresFlags);
return RequiresFlags;
}
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
if (DeviceRTL.isValidDeviceId(src_dev_id) &&
DeviceRTL.isValidDeviceId(dst_dev_id))
return 1;
return 0;
}
int32_t __tgt_rtl_init_device(int32_t device_id) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
return DeviceRTL.initDevice(device_id);
}
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
__tgt_device_image *image) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
return DeviceRTL.loadBinary(device_id, image);
}
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
int32_t kind) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
if (kind != TARGET_ALLOC_DEFAULT) {
REPORT("Invalid target data allocation kind or requested allocator not "
"implemented yet\n");
    return nullptr;
}
return DeviceRTL.dataAlloc(device_id, size);
}
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
int64_t size) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
__tgt_async_info AsyncInfo;
const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
size, &AsyncInfo);
if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}
int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
void *hst_ptr, int64_t size,
__tgt_async_info *async_info_ptr) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
assert(async_info_ptr && "async_info_ptr is nullptr");
return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
async_info_ptr);
}
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
int64_t size) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
__tgt_async_info AsyncInfo;
const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
size, &AsyncInfo);
if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}
int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
void *tgt_ptr, int64_t size,
__tgt_async_info *async_info_ptr) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
assert(async_info_ptr && "async_info_ptr is nullptr");
return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
async_info_ptr);
}
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
int64_t size,
__tgt_async_info *AsyncInfo) {
assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
assert(AsyncInfo && "AsyncInfo is nullptr");
return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
AsyncInfo);
}
int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
int32_t dst_dev_id, void *dst_ptr,
int64_t size) {
assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
__tgt_async_info AsyncInfo;
const int32_t rc = __tgt_rtl_data_exchange_async(
src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
return DeviceRTL.dataDelete(device_id, tgt_ptr);
}
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
void **tgt_args,
ptrdiff_t *tgt_offsets,
int32_t arg_num, int32_t team_num,
int32_t thread_limit,
uint64_t loop_tripcount) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
__tgt_async_info AsyncInfo;
const int32_t rc = __tgt_rtl_run_target_team_region_async(
device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
thread_limit, loop_tripcount, &AsyncInfo);
if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}
int32_t __tgt_rtl_run_target_team_region_async(
int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
int32_t thread_limit, uint64_t loop_tripcount,
__tgt_async_info *async_info_ptr) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
return DeviceRTL.runTargetTeamRegion(
device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
thread_limit, loop_tripcount, async_info_ptr);
}
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
void **tgt_args, ptrdiff_t *tgt_offsets,
int32_t arg_num) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
__tgt_async_info AsyncInfo;
const int32_t rc = __tgt_rtl_run_target_region_async(
device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
if (rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
void *tgt_entry_ptr, void **tgt_args,
ptrdiff_t *tgt_offsets,
int32_t arg_num,
__tgt_async_info *async_info_ptr) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
return __tgt_rtl_run_target_team_region_async(
device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
async_info_ptr);
}
int32_t __tgt_rtl_synchronize(int32_t device_id,
__tgt_async_info *async_info_ptr) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
assert(async_info_ptr && "async_info_ptr is nullptr");
assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
return DeviceRTL.synchronize(device_id, async_info_ptr);
}
#ifdef __cplusplus
}
#endif