//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <list>
#include <string>
#include <vector>
#include "omptargetplugin.h"
#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif
#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__)
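// GETNAME uses two-level macro expansion so the *value* of TARGET_NAME
// ("CUDA") is stringized rather than the token itself; DP() then prefixes
// every debug message with "Target CUDA RTL".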
#include "../../common/elf_common.c"
// Utility for retrieving and printing CUDA error string.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err) \
do { \
const char *errStr; \
cuGetErrorString(err, &errStr); \
DP("CUDA error is: %s\n", errStr); \
} while (0)
#else
#define CUDA_ERR_STRING(err) \
do { \
} while (0)
#endif
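// Error strings are retrieved and printed only when the library is built
// with CUDA_ERROR_REPORT defined; otherwise CUDA_ERR_STRING is a no-op.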
/// Keep entries table per device.
struct FuncOrGblEntryTy {
__tgt_target_table Table;
std::vector<__tgt_offload_entry> Entries;
};
enum ExecutionModeType {
SPMD, // constructors, destructors, and
// combined constructs (`teams distribute parallel for [simd]`)
GENERIC, // everything else
NONE
};
/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
CUfunction Func;
// execution mode of kernel
// 0 - SPMD mode (without master warp)
// 1 - Generic mode (with master warp)
int8_t ExecutionMode;
KernelTy(CUfunction _Func, int8_t _ExecutionMode)
: Func(_Func), ExecutionMode(_ExecutionMode) {}
};
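// The offload table built in __tgt_rtl_load_binary stores a pointer to one of
// these objects as the device-side entry address; __tgt_rtl_run_target_team_region
// casts it back to recover the CUfunction and its execution mode.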
/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;
/// Class containing all the device information.
class RTLDeviceInfoTy {
std::vector<FuncOrGblEntryTy> FuncGblEntries;
public:
int NumberOfDevices;
std::vector<CUmodule> Modules;
std::vector<CUcontext> Contexts;
// Device properties
std::vector<int> ThreadsPerBlock;
std::vector<int> BlocksPerGrid;
std::vector<int> WarpSize;
// OpenMP properties
std::vector<int> NumTeams;
std::vector<int> NumThreads;
// OpenMP Environment properties
int EnvNumTeams;
int EnvTeamLimit;
//static int EnvNumThreads;
static const int HardTeamLimit = 1 << 16; // 64k
static const int HardThreadLimit = 1024;
static const int DefaultNumTeams = 128;
static const int DefaultNumThreads = 128;
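// The hard limits cap whatever maxima the device reports; the defaults are
// used when properties cannot be queried or no environment override is given.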
// Record an entry point associated with a device
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
assert(device_id < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[device_id];
E.Entries.push_back(entry);
}
// Return true if the entry is associated with the device
bool findOffloadEntry(int32_t device_id, void *addr) {
assert(device_id < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[device_id];
for (auto &it : E.Entries) {
if (it.addr == addr)
return true;
}
return false;
}
// Return the pointer to the target entries table
__tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
assert(device_id < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[device_id];
int32_t size = E.Entries.size();
// Table is empty
if (!size)
return 0;
__tgt_offload_entry *begin = &E.Entries[0];
__tgt_offload_entry *end = begin + size;
// Update table info according to the entries and return the pointer
E.Table.EntriesBegin = begin;
E.Table.EntriesEnd = end;
return &E.Table;
}
// Clear entries table for a device
void clearOffloadEntriesTable(int32_t device_id) {
assert(device_id < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[device_id];
E.Entries.clear();
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
}
RTLDeviceInfoTy() {
DP("Start initializing CUDA\n");
CUresult err = cuInit(0);
if (err != CUDA_SUCCESS) {
DP("Error when initializing CUDA\n");
CUDA_ERR_STRING(err);
return;
}
NumberOfDevices = 0;
err = cuDeviceGetCount(&NumberOfDevices);
if (err != CUDA_SUCCESS) {
DP("Error when getting CUDA device count\n");
CUDA_ERR_STRING(err);
return;
}
if (NumberOfDevices == 0) {
DP("There are no devices supporting CUDA.\n");
return;
}
FuncGblEntries.resize(NumberOfDevices);
Contexts.resize(NumberOfDevices);
ThreadsPerBlock.resize(NumberOfDevices);
BlocksPerGrid.resize(NumberOfDevices);
WarpSize.resize(NumberOfDevices);
NumTeams.resize(NumberOfDevices);
NumThreads.resize(NumberOfDevices);
// Get environment variables regarding teams
char *envStr = getenv("OMP_TEAM_LIMIT");
if (envStr) {
// OMP_TEAM_LIMIT has been set
EnvTeamLimit = std::stoi(envStr);
DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
} else {
EnvTeamLimit = -1;
}
envStr = getenv("OMP_NUM_TEAMS");
if (envStr) {
// OMP_NUM_TEAMS has been set
EnvNumTeams = std::stoi(envStr);
DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
} else {
EnvNumTeams = -1;
}
}
~RTLDeviceInfoTy() {
// Close modules
for (auto &module : Modules)
if (module) {
CUresult err = cuModuleUnload(module);
if (err != CUDA_SUCCESS) {
DP("Error when unloading CUDA module\n");
CUDA_ERR_STRING(err);
}
}
// Destroy contexts
for (auto &ctx : Contexts)
if (ctx) {
CUresult err = cuCtxDestroy(ctx);
if (err != CUDA_SUCCESS) {
DP("Error when destroying CUDA context\n");
CUDA_ERR_STRING(err);
}
}
}
};
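// A single global instance holds all per-device state: its constructor runs
// when the plugin is loaded (initializing the CUDA driver and reading the
// environment) and its destructor unloads modules and destroys contexts.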
static RTLDeviceInfoTy DeviceInfo;
#ifdef __cplusplus
extern "C" {
#endif
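// A sketch of the call sequence libomptarget typically drives, for reference:
//   __tgt_rtl_is_valid_binary(image)      -> nonzero for EM_CUDA images
//   __tgt_rtl_init_device(id)             -> context + launch limits
//   __tgt_rtl_load_binary(id, image)      -> offload entries table
//   __tgt_rtl_data_alloc/_submit(...)     -> map data to the device
//   __tgt_rtl_run_target_team_region(...) -> launch the kernel
//   __tgt_rtl_data_retrieve/_delete(...)  -> copy back and free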
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
return elf_check_machine(image, 190); // EM_CUDA = 190.
}
int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
int32_t __tgt_rtl_init_device(int32_t device_id) {
CUdevice cuDevice;
DP("Getting device %d\n", device_id);
CUresult err = cuDeviceGet(&cuDevice, device_id);
if (err != CUDA_SUCCESS) {
DP("Error when getting CUDA device with id = %d\n", device_id);
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}
// Create the context and save it to use whenever this device is selected.
err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
cuDevice);
if (err != CUDA_SUCCESS) {
DP("Error when creating a CUDA context\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}
// Scan properties to determine the number of threads/block and blocks/grid.
struct cudaDeviceProp Properties;
cudaError_t error = cudaGetDeviceProperties(&Properties, device_id);
if (error != cudaSuccess) {
DP("Error getting device Properties, use defaults\n");
DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
DeviceInfo.WarpSize[device_id] = 32;
} else {
// Get blocks per grid
if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
} else {
DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
"at the hard limit\n", Properties.maxGridSize[0],
RTLDeviceInfoTy::HardTeamLimit);
}
// Get threads per block; exploit threads only along the x axis
if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
DP("(fewer than max per block along all xyz dims %d)\n",
Properties.maxThreadsPerBlock);
}
} else {
DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
"capping at the hard limit\n", Properties.maxThreadsDim[0],
RTLDeviceInfoTy::HardThreadLimit);
}
// Get warp size
DeviceInfo.WarpSize[device_id] = Properties.warpSize;
}
// Adjust teams to the env variables
if (DeviceInfo.EnvTeamLimit > 0 &&
DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
DeviceInfo.EnvTeamLimit);
}
DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
DeviceInfo.WarpSize[device_id]);
// Set default number of teams
if (DeviceInfo.EnvNumTeams > 0) {
DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
DP("Default number of teams set according to environment %d\n",
DeviceInfo.EnvNumTeams);
} else {
DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
DP("Default number of teams set according to library's default %d\n",
RTLDeviceInfoTy::DefaultNumTeams);
}
if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
DP("Default number of teams exceeds device limit, capping at %d\n",
DeviceInfo.BlocksPerGrid[device_id]);
}
// Set default number of threads
DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
DP("Default number of threads set according to library's default %d\n",
RTLDeviceInfoTy::DefaultNumThreads);
if (DeviceInfo.NumThreads[device_id] >
DeviceInfo.ThreadsPerBlock[device_id]) {
DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
DP("Default number of threads exceeds device limit, capping at %d\n",
DeviceInfo.ThreadsPerBlock[device_id]);
}
return OFFLOAD_SUCCESS;
}
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
__tgt_device_image *image) {
// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
if (err != CUDA_SUCCESS) {
DP("Error when setting a CUDA context for device %d\n", device_id);
CUDA_ERR_STRING(err);
return NULL;
}
// Clear the offload table as we are going to create a new one.
DeviceInfo.clearOffloadEntriesTable(device_id);
// Create the module and extract the function pointers.
CUmodule cumod;
DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
if (err != CUDA_SUCCESS) {
DP("Error when loading CUDA module\n");
CUDA_ERR_STRING(err);
return NULL;
}
DP("CUDA module successfully loaded!\n");
DeviceInfo.Modules.push_back(cumod);
// Find the symbols in the module by name.
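// Entries with a non-zero size are `declare target` globals resolved with
// cuModuleGetGlobal; zero-sized entries are kernels resolved with
// cuModuleGetFunction.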
__tgt_offload_entry *HostBegin = image->EntriesBegin;
__tgt_offload_entry *HostEnd = image->EntriesEnd;
for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
if (!e->addr) {
// Each host entry must carry a non-null address that uniquely identifies
// the target region; a null address means the binary is invalid.
DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);
return NULL;
}
if (e->size) {
__tgt_offload_entry entry = *e;
CUdeviceptr cuptr;
size_t cusize;
err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);
if (err != CUDA_SUCCESS) {
DP("Loading global '%s' (Failed)\n", e->name);
CUDA_ERR_STRING(err);
return NULL;
}
if (cusize != e->size) {
DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
cusize, e->size);
CUDA_ERR_STRING(err);
return NULL;
}
DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
entry.addr = (void *)cuptr;
DeviceInfo.addOffloadEntry(device_id, entry);
continue;
}
CUfunction fun;
err = cuModuleGetFunction(&fun, cumod, e->name);
if (err != CUDA_SUCCESS) {
DP("Loading '%s' (Failed)\n", e->name);
CUDA_ERR_STRING(err);
return NULL;
}
DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
// Default value is GENERIC (in case the symbol is missing from the cubin file)
int8_t ExecModeVal = ExecutionModeType::GENERIC;
std::string ExecModeNameStr (e->name);
ExecModeNameStr += "_exec_mode";
const char *ExecModeName = ExecModeNameStr.c_str();
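// The compiler emits a companion int8_t global per kernel, named by appending
// "_exec_mode"; e.g. a kernel `__omp_offloading_xy_main_l10` would come with
// `__omp_offloading_xy_main_l10_exec_mode` (example name is illustrative).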
CUdeviceptr ExecModePtr;
size_t cusize;
err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
if (err == CUDA_SUCCESS) {
if (cusize != sizeof(int8_t)) {
DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
ExecModeName, cusize, sizeof(int8_t));
CUDA_ERR_STRING(err);
return NULL;
}
err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
if (err != CUDA_SUCCESS) {
DP("Error when copying data from device to host. Pointers: "
"host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
CUDA_ERR_STRING(err);
return NULL;
}
if (ExecModeVal < 0 || ExecModeVal > 1) {
DP("Error wrong exec_mode value specified in cubin file: %d\n",
ExecModeVal);
return NULL;
}
} else {
DP("Loading global exec_mode '%s' - symbol missing, using default value "
"GENERIC (1)\n", ExecModeName);
CUDA_ERR_STRING(err);
}
KernelsList.push_back(KernelTy(fun, ExecModeVal));
__tgt_offload_entry entry = *e;
entry.addr = (void *)&KernelsList.back();
DeviceInfo.addOffloadEntry(device_id, entry);
}
return DeviceInfo.getOffloadEntriesTable(device_id);
}
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
if (size == 0) {
return NULL;
}
// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
if (err != CUDA_SUCCESS) {
DP("Error while trying to set CUDA current context\n");
CUDA_ERR_STRING(err);
return NULL;
}
CUdeviceptr ptr;
err = cuMemAlloc(&ptr, size);
if (err != CUDA_SUCCESS) {
DP("Error while trying to allocate %d\n", err);
CUDA_ERR_STRING(err);
return NULL;
}
return (void *)ptr;
}
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
int64_t size) {
// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
if (err != CUDA_SUCCESS) {
DP("Error when setting CUDA context\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}
err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
if (err != CUDA_SUCCESS) {
DP("Error when copying data from host to device. Pointers: host = " DPxMOD
", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
DPxPTR(tgt_ptr), size);
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
int64_t size) {
// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
if (err != CUDA_SUCCESS) {
DP("Error when setting CUDA context\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}
err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
if (err != CUDA_SUCCESS) {
DP("Error when copying data from device to host. Pointers: host = " DPxMOD
", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
DPxPTR(tgt_ptr), size);
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
if (err != CUDA_SUCCESS) {
DP("Error when setting CUDA context\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}
err = cuMemFree((CUdeviceptr)tgt_ptr);
if (err != CUDA_SUCCESS) {
DP("Error when freeing CUDA memory\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
int32_t thread_limit, uint64_t loop_tripcount) {
// Set the context we are using.
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
if (err != CUDA_SUCCESS) {
DP("Error when setting CUDA context\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}
// All args are passed by reference; cuLaunchKernel expects an array of
// pointers to the argument values.
std::vector<void *> args(arg_num);
std::vector<void *> ptrs(arg_num);
for (int32_t i = 0; i < arg_num; ++i) {
ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
args[i] = &ptrs[i];
}
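// E.g. a mapped array whose device buffer begins at tgt_args[i] but is used
// at a nonzero tgt_offsets[i] is passed to the kernel as the single adjusted
// pointer tgt_args[i] + tgt_offsets[i].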
KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
int cudaThreadsPerBlock;
if (thread_limit > 0) {
cudaThreadsPerBlock = thread_limit;
DP("Setting CUDA threads per block to requested %d\n", thread_limit);
// Add master warp if necessary
if (KernelInfo->ExecutionMode == GENERIC) {
cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
}
} else {
cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
DP("Setting CUDA threads per block to default %d\n",
DeviceInfo.NumThreads[device_id]);
}
if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
DP("Threads per block capped at device limit %d\n",
DeviceInfo.ThreadsPerBlock[device_id]);
}
int kernel_limit;
err = cuFuncGetAttribute(&kernel_limit,
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
if (err == CUDA_SUCCESS) {
if (kernel_limit < cudaThreadsPerBlock) {
cudaThreadsPerBlock = kernel_limit;
DP("Threads per block capped at kernel limit %d\n", kernel_limit);
}
}
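// The per-kernel attribute accounts for the kernel's resource usage (e.g.
// registers per thread), so it can be lower than the device-wide maximum.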
int cudaBlocksPerGrid;
if (team_num <= 0) {
if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
if (KernelInfo->ExecutionMode == SPMD) {
// We have a combined construct, i.e. `target teams distribute parallel
// for [simd]`. We launch so many teams so that each thread will
// execute one iteration of the loop.
// round up to the nearest integer
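// (e.g. a trip count of 10000 with 128 threads per block yields
// ((10000 - 1) / 128) + 1 = 79 blocks)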
cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
} else {
// If we reach this point, then we have a non-combined construct, i.e.
// `teams distribute` with a nested `parallel for` and each team is
// assigned one iteration of the `distribute` loop. E.g.:
//
// #pragma omp target teams distribute
// for(...loop_tripcount...) {
// #pragma omp parallel for
// for(...) {}
// }
//
// Threads within a team will execute the iterations of the `parallel`
// loop.
cudaBlocksPerGrid = loop_tripcount;
}
DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
"threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
cudaThreadsPerBlock);
} else {
cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
}
} else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
DP("Capping number of teams to team limit %d\n",
DeviceInfo.BlocksPerGrid[device_id]);
} else {
cudaBlocksPerGrid = team_num;
DP("Using requested number of teams %d\n", team_num);
}
// Run on the device.
DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
cudaThreadsPerBlock);
err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, /*gridDimY=*/1,
/*gridDimZ=*/1, cudaThreadsPerBlock, /*blockDimY=*/1, /*blockDimZ=*/1,
/*sharedMemBytes=*/0, /*hStream=*/0, &args[0], /*extra=*/0);
if (err != CUDA_SUCCESS) {
DP("Device kernel launch failed!\n");
CUDA_ERR_STRING(err);
return OFFLOAD_FAIL;
}
DP("Launch of entry point at " DPxMOD " successful!\n",
DPxPTR(tgt_entry_ptr));
cudaError_t sync_error = cudaDeviceSynchronize();
if (sync_error != cudaSuccess) {
DP("Kernel execution error at " DPxMOD ", %s.\n", DPxPTR(tgt_entry_ptr),
cudaGetErrorString(sync_error));
return OFFLOAD_FAIL;
} else {
DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
}
return OFFLOAD_SUCCESS;
}
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
// Use one team and the default number of threads.
const int32_t team_num = 1;
const int32_t thread_limit = 0;
return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
tgt_offsets, arg_num, team_num, thread_limit, 0);
}
#ifdef __cplusplus
}
#endif