//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for AMD HSA machine
//
//===----------------------------------------------------------------------===//
#include <algorithm>
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <libelf.h>
#include <list>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <vector>
#include "impl_runtime.h"
#include "interop_hsa.h"
#include "internal.h"
#include "rt.h"
#include "DeviceEnvironment.h"
#include "get_elf_mach_gfx_name.h"
#include "omptargetplugin.h"
#include "print_tracing.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
using namespace llvm;
// Hostrpc interface. FIXME: consider moving to its own include. These
// functions are statically linked into the amdgpu plugin from
// hostrpc_services.a if present, linked as --whole-archive to override the
// weak symbols that implement a fallback for toolchains that do not yet have
// a hostrpc library.
extern "C" {
uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
uint32_t DeviceId);
hsa_status_t hostrpc_init();
hsa_status_t hostrpc_terminate();
__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
__attribute__((weak)) hsa_status_t hostrpc_terminate() {
return HSA_STATUS_SUCCESS;
}
__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
uint32_t DeviceId) {
DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
"missing\n",
DeviceId);
return 0;
}
}
// Heuristic parameters used for kernel launch
// Number of teams per CU to allow scheduling flexibility
static const unsigned DefaultTeamsPerCU = 4;
int print_kernel_trace;
#ifdef OMPTARGET_DEBUG
#define check(msg, status) \
if (status != HSA_STATUS_SUCCESS) { \
DP(#msg " failed\n"); \
} else { \
DP(#msg " succeeded\n"); \
}
#else
#define check(msg, status) \
{}
#endif
#include "elf_common.h"
namespace hsa {
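// Thin wrappers that adapt C++ callables to the C callback signatures used by
// the HSA iteration APIs.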
template <typename C> hsa_status_t iterate_agents(C Cb) {
auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
C *Unwrapped = static_cast<C *>(Data);
return (*Unwrapped)(Agent);
};
return hsa_iterate_agents(L, static_cast<void *>(&Cb));
}
template <typename C>
hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
C *Unwrapped = static_cast<C *>(Data);
return (*Unwrapped)(MemoryPool);
};
return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
}
} // namespace hsa
/// Keep entries table per device
struct FuncOrGblEntryTy {
__tgt_target_table Table;
std::vector<__tgt_offload_entry> Entries;
};
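/// Pool of kernarg segments for a single kernel. A fixed number of segments
/// (MAX_NUM_KERNELS), each large enough for the explicit plus implicit
/// arguments, is allocated up front and handed out to concurrent launches.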
struct KernelArgPool {
private:
static pthread_mutex_t Mutex;
public:
uint32_t KernargSegmentSize;
void *KernargRegion = nullptr;
std::queue<int> FreeKernargSegments;
uint32_t kernargSizeIncludingImplicit() {
return KernargSegmentSize + sizeof(impl_implicit_args_t);
}
~KernelArgPool() {
if (KernargRegion) {
auto R = hsa_amd_memory_pool_free(KernargRegion);
if (R != HSA_STATUS_SUCCESS) {
DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
}
}
}
// Can't really copy or move a mutex
KernelArgPool() = default;
KernelArgPool(const KernelArgPool &) = delete;
KernelArgPool(KernelArgPool &&) = delete;
KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
: KernargSegmentSize(KernargSegmentSize) {
// impl uses one pool per kernel for all gpus, with a fixed upper size;
// that exact scheme is preserved here, including the queue<int>.
hsa_status_t Err = hsa_amd_memory_pool_allocate(
MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
&KernargRegion);
if (Err != HSA_STATUS_SUCCESS) {
DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
KernargRegion = nullptr; // paranoid
return;
}
Err = core::allow_access_to_all_gpu_agents(KernargRegion);
if (Err != HSA_STATUS_SUCCESS) {
DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
get_error_string(Err));
auto R = hsa_amd_memory_pool_free(KernargRegion);
if (R != HSA_STATUS_SUCCESS) {
// if free failed, can't do anything more to resolve it
DP("hsa memory pool free failed: %s\n", get_error_string(R));
}
KernargRegion = nullptr;
return;
}
for (int I = 0; I < MAX_NUM_KERNELS; I++) {
FreeKernargSegments.push(I);
}
}
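// Borrow a kernarg segment from the pool; returns nullptr if every segment is
// in use. The caller must return it via deallocate().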
void *allocate(uint64_t ArgNum) {
assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
Lock L(&Mutex);
void *Res = nullptr;
if (!FreeKernargSegments.empty()) {
int FreeIdx = FreeKernargSegments.front();
Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
(FreeIdx * kernargSizeIncludingImplicit()));
assert(FreeIdx == pointerToIndex(Res));
FreeKernargSegments.pop();
}
return Res;
}
void deallocate(void *Ptr) {
Lock L(&Mutex);
int Idx = pointerToIndex(Ptr);
FreeKernargSegments.push(Idx);
}
private:
int pointerToIndex(void *Ptr) {
ptrdiff_t Bytes =
static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
assert(Bytes >= 0);
assert(Bytes % kernargSizeIncludingImplicit() == 0);
return Bytes / kernargSizeIncludingImplicit();
}
struct Lock {
Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
~Lock() { pthread_mutex_unlock(M); }
pthread_mutex_t *M;
};
};
pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
KernelArgPoolMap;
/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
llvm::omp::OMPTgtExecModeFlags ExecutionMode;
int16_t ConstWGSize;
int32_t DeviceId;
void *CallStackAddr = nullptr;
const char *Name;
KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
int32_t DeviceId, void *CallStackAddr, const char *Name,
uint32_t KernargSegmentSize,
hsa_amd_memory_pool_t &KernArgMemoryPool)
: ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
std::string N(Name);
if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
KernelArgPoolMap.insert(
std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
KernargSegmentSize, KernArgMemoryPool))));
}
}
};
/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;
template <typename Callback> static hsa_status_t findAgents(Callback CB) {
hsa_status_t Err =
hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
hsa_device_type_t DeviceType;
// get_info fails iff HSA runtime not yet initialized
hsa_status_t Err =
hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
if (Err != HSA_STATUS_SUCCESS) {
if (print_kernel_trace > 0)
DP("rtl.cpp: err %s\n", get_error_string(Err));
return Err;
}
CB(DeviceType, Agent);
return HSA_STATUS_SUCCESS;
});
// iterate_agents fails iff HSA runtime not yet initialized
if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
DP("rtl.cpp: err %s\n", get_error_string(Err));
}
return Err;
}
static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
void *Data) {
if (Status != HSA_STATUS_SUCCESS) {
const char *StatusString;
if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
StatusString = "unavailable";
}
DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,
Status, StatusString);
abort();
}
}
namespace core {
namespace {
bool checkResult(hsa_status_t Err, const char *ErrMsg) {
if (Err == HSA_STATUS_SUCCESS)
return true;
REPORT("%s", ErrMsg);
REPORT("%s", get_error_string(Err));
return false;
}
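// Write the packet header and setup fields as a single 32-bit release store,
// which is what publishes an AQL packet to the packet processor.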
void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
__atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
}
uint16_t createHeader() {
uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
return Header;
}
hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
bool AllocAllowed = false;
hsa_status_t Err = hsa_amd_memory_pool_get_info(
MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
&AllocAllowed);
if (Err != HSA_STATUS_SUCCESS) {
DP("Alloc allowed in memory pool check failed: %s\n",
get_error_string(Err));
return Err;
}
size_t Size = 0;
Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
&Size);
if (Err != HSA_STATUS_SUCCESS) {
DP("Get memory pool size failed: %s\n", get_error_string(Err));
return Err;
}
return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
std::vector<hsa_amd_memory_pool_t> *Result =
static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
hsa_status_t Err;
if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
return Err;
}
Result->push_back(MemoryPool);
return HSA_STATUS_SUCCESS;
}
} // namespace
} // namespace core
struct EnvironmentVariables {
int NumTeams;
int TeamLimit;
int TeamThreadLimit;
int MaxTeamsDefault;
int DynamicMemSize;
};
template <uint32_t wavesize>
static constexpr const llvm::omp::GV &getGridValue() {
return llvm::omp::getAMDGPUGridValues<wavesize>();
}
struct HSALifetime {
// Wrapper around HSA used to ensure it is constructed before other types
// and destructed after them, which means those other types can use RAII for
// cleanup without risking running outside of the lifetime of HSA.
const hsa_status_t S;
bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
HSALifetime() : S(hsa_init()) {}
~HSALifetime() {
if (S == HSA_STATUS_SUCCESS) {
hsa_status_t Err = hsa_shut_down();
if (Err != HSA_STATUS_SUCCESS) {
// Can't call into HSA to get a string from the integer
DP("Shutting down HSA failed: %d\n", Err);
}
}
}
};
// Handle scheduling of multiple hsa_queue's per device to
// multiple threads (one scheduler per device)
class HSAQueueScheduler {
public:
HSAQueueScheduler() : Current(0) {}
HSAQueueScheduler(const HSAQueueScheduler &) = delete;
HSAQueueScheduler(HSAQueueScheduler &&Q) {
Current = Q.Current.load();
for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
HSAQueues[I] = Q.HSAQueues[I];
Q.HSAQueues[I] = nullptr;
}
}
// \return false if any HSA queue creation fails
bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
hsa_queue_t *Q = nullptr;
hsa_status_t Rc =
hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
if (Rc != HSA_STATUS_SUCCESS) {
DP("Failed to create HSA queue %d\n", I);
return false;
}
HSAQueues[I] = Q;
}
return true;
}
~HSAQueueScheduler() {
for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
if (HSAQueues[I]) {
hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
if (Err != HSA_STATUS_SUCCESS)
DP("Error destroying HSA queue");
}
}
}
// \return next queue to use for device
hsa_queue_t *next() {
return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
NUM_QUEUES_PER_DEVICE];
}
private:
// Number of queues per device
enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
std::atomic<uint8_t> Current;
};
/// Class containing all the device information
class RTLDeviceInfoTy : HSALifetime {
std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
struct QueueDeleter {
void operator()(hsa_queue_t *Q) {
if (Q) {
hsa_status_t Err = hsa_queue_destroy(Q);
if (Err != HSA_STATUS_SUCCESS) {
DP("Error destroying hsa queue: %s\n", get_error_string(Err));
}
}
}
};
public:
bool ConstructionSucceeded = false;
// Loading a binary populates symbol tables and mutates various global state;
// running a kernel uses those symbol tables.
std::shared_timed_mutex LoadRunLock;
int NumberOfDevices = 0;
// GPU devices
std::vector<hsa_agent_t> HSAAgents;
std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
// CPUs
std::vector<hsa_agent_t> CPUAgents;
// Device properties
std::vector<int> ComputeUnits;
std::vector<int> GroupsPerDevice;
std::vector<int> ThreadsPerGroup;
std::vector<int> WarpSize;
std::vector<std::string> GPUName;
std::vector<std::string> TargetID;
// OpenMP properties
std::vector<int> NumTeams;
std::vector<int> NumThreads;
// OpenMP Environment properties
EnvironmentVariables Env;
// OpenMP Requires Flags
int64_t RequiresFlags;
// Resource pools
SignalPoolT FreeSignalPool;
bool HostcallRequired = false;
std::vector<hsa_executable_t> HSAExecutables;
std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
hsa_amd_memory_pool_t KernArgPool;
// fine grained memory pool for host allocations
hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
// fine and coarse-grained memory pools per offloading device
std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
struct ImplFreePtrDeletor {
void operator()(void *P) {
core::Runtime::Memfree(P); // ignore failure to free
}
};
// device_State shared across loaded binaries, error if inconsistent size
std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
DeviceStateStore;
static const unsigned HardTeamLimit =
(1 << 16) - 1; // 64K - 1 so the value fits in a uint16
static const int DefaultNumTeams = 128;
// These need to be per-device since different devices can have different
// wave sizes, but are currently the same number for each so that refactor
// can be postponed.
static_assert(getGridValue<32>().GV_Max_Teams ==
getGridValue<64>().GV_Max_Teams,
"");
static const int MaxTeams = getGridValue<64>().GV_Max_Teams;
static_assert(getGridValue<32>().GV_Max_WG_Size ==
getGridValue<64>().GV_Max_WG_Size,
"");
static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;
static_assert(getGridValue<32>().GV_Default_WG_Size ==
getGridValue<64>().GV_Default_WG_Size,
"");
static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;
using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
hsa_agent_t, hsa_amd_memory_pool_t);
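// Borrow a signal from the pool for one blocking memcpy performed by the given
// impl_memcpy_* helper, then return the signal to the pool.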
hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
MemcpyFunc Func, int32_t DeviceId) {
hsa_agent_t Agent = HSAAgents[DeviceId];
hsa_signal_t S = FreeSignalPool.pop();
if (S.handle == 0) {
return HSA_STATUS_ERROR;
}
hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
FreeSignalPool.push(S);
return R;
}
hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
int32_t DeviceId) {
return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
}
hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
int32_t DeviceId) {
return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
}
static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
char TmpChar[1000];
uint16_t Major, Minor;
uint32_t TmpUInt;
uint32_t TmpUInt2;
uint32_t CacheSize[4];
bool TmpBool;
uint16_t WorkgroupMaxDim[3];
hsa_dim3_t GridMaxDim;
// Getting basic information about HSA and Device
core::checkResult(
hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
"Error from hsa_system_get_info when obtaining "
"HSA_SYSTEM_INFO_VERSION_MAJOR\n");
core::checkResult(
hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
"Error from hsa_system_get_info when obtaining "
"HSA_SYSTEM_INFO_VERSION_MINOR\n");
printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId);
core::checkResult(
hsa_agent_get_info(
Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
printf(" Product Name: \t\t\t%s \n", TmpChar);
core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_NAME\n");
printf(" Device Name: \t\t\t%s \n", TmpChar);
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_VENDOR_NAME\n");
printf(" Vendor Name: \t\t\t%s \n", TmpChar);
hsa_device_type_t DevType;
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_DEVICE\n");
printf(" Device Type: \t\t\t%s \n",
DevType == HSA_DEVICE_TYPE_CPU
? "CPU"
: (DevType == HSA_DEVICE_TYPE_GPU
? "GPU"
: (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_QUEUES_MAX\n");
printf(" Max Queues: \t\t\t%u \n", TmpUInt);
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
printf(" Queue Min Size: \t\t\t%u \n", TmpUInt);
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
printf(" Queue Max Size: \t\t\t%u \n", TmpUInt);
// Getting cache information
printf(" Cache:\n");
// FIXME: This is deprecated according to HSA documentation. But using
// hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
// runtime.
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_CACHE_SIZE\n");
for (int I = 0; I < 4; I++) {
if (CacheSize[I]) {
printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
}
}
core::checkResult(
hsa_agent_get_info(Agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
&TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
printf(" Cacheline Size: \t\t\t%u \n", TmpUInt);
core::checkResult(
hsa_agent_get_info(
Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
&TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
core::checkResult(
hsa_agent_get_info(
Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
&TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
printf(" Compute Units: \t\t\t%u \n", TmpUInt);
core::checkResult(hsa_agent_get_info(
Agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
&TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
printf(" SIMD per CU: \t\t\t%u \n", TmpUInt);
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_FAST_F16_OPERATION\n");
printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2);
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
printf(" Workgroup Max Size: \t\t%u \n", TmpUInt);
core::checkResult(hsa_agent_get_info(Agent,
HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
WorkgroupMaxDim),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
printf(" Workgroup Max Size per Dimension:\n");
printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
core::checkResult(hsa_agent_get_info(
Agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
&TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt);
printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_GRID_MAX_SIZE\n");
printf(" Grid Max Size: \t\t\t%u \n", TmpUInt);
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_GRID_MAX_DIM\n");
printf(" Grid Max Size per Dimension: \t\t\n");
printf(" x: \t\t\t\t%u\n", GridMaxDim.x);
printf(" y: \t\t\t\t%u\n", GridMaxDim.y);
printf(" z: \t\t\t\t%u\n", GridMaxDim.z);
core::checkResult(
hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
"Error returned from hsa_agent_get_info when obtaining "
"HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
printf(" Memory Pools:\n");
auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
std::string TmpStr;
size_t Size;
bool Alloc, Access;
hsa_amd_segment_t Segment;
hsa_amd_memory_pool_global_flag_t GlobalFlags;
core::checkResult(
hsa_amd_memory_pool_get_info(
Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
"Error returned from hsa_amd_memory_pool_get_info when obtaining "
"HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
core::checkResult(hsa_amd_memory_pool_get_info(
Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
"Error returned from hsa_amd_memory_pool_get_info when "
"obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
switch (Segment) {
case HSA_AMD_SEGMENT_GLOBAL:
TmpStr = "GLOBAL; FLAGS: ";
if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
TmpStr += "KERNARG, ";
if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
TmpStr += "FINE GRAINED, ";
if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
TmpStr += "COARSE GRAINED, ";
break;
case HSA_AMD_SEGMENT_READONLY:
TmpStr = "READONLY";
break;
case HSA_AMD_SEGMENT_PRIVATE:
TmpStr = "PRIVATE";
break;
case HSA_AMD_SEGMENT_GROUP:
TmpStr = "GROUP";
break;
}
printf(" Pool %s: \n", TmpStr.c_str());
core::checkResult(hsa_amd_memory_pool_get_info(
Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
"Error returned from hsa_amd_memory_pool_get_info when "
"obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
printf(" Size: \t\t\t\t %zu bytes\n", Size);
core::checkResult(
hsa_amd_memory_pool_get_info(
Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
"Error returned from hsa_amd_memory_pool_get_info when obtaining "
"HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
core::checkResult(
hsa_amd_memory_pool_get_info(
Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
"Error returned from hsa_amd_memory_pool_get_info when obtaining "
"HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size);
core::checkResult(
hsa_amd_memory_pool_get_info(
Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
"Error returned from hsa_amd_memory_pool_get_info when obtaining "
"HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
printf(" Runtime Alloc alignment: \t %zu bytes\n", Size);
core::checkResult(
hsa_amd_memory_pool_get_info(
Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
"Error returned from hsa_amd_memory_pool_get_info when obtaining "
"HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
printf(" Accessible by all: \t\t %s\n",
(Access ? "TRUE" : "FALSE"));
return HSA_STATUS_SUCCESS;
};
// Iterate over all the memory regions for this agent. Get the memory region
// type and size
hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);
printf(" ISAs:\n");
auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
char TmpChar[1000];
core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
"Error returned from hsa_isa_get_info_alt when "
"obtaining HSA_ISA_INFO_NAME\n");
printf(" Name: \t\t\t\t %s\n", TmpChar);
return HSA_STATUS_SUCCESS;
};
// Iterate over all the ISAs supported by this agent and print their names.
hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
}
// Record entry point associated with device
void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
assert(DeviceId < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
E.Entries.push_back(Entry);
}
// Return true if the entry is associated with device
bool findOffloadEntry(int32_t DeviceId, void *Addr) {
assert(DeviceId < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
for (auto &It : E.Entries) {
if (It.addr == Addr)
return true;
}
return false;
}
// Return the pointer to the target entries table
__tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
assert(DeviceId < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
int32_t Size = E.Entries.size();
// Table is empty
if (!Size)
return 0;
__tgt_offload_entry *Begin = &E.Entries[0];
__tgt_offload_entry *End = &E.Entries[Size - 1];
// Update table info according to the entries and return the pointer
E.Table.EntriesBegin = Begin;
E.Table.EntriesEnd = ++End;
return &E.Table;
}
// Clear entries table for a device
void clearOffloadEntriesTable(int DeviceId) {
assert(DeviceId < (int32_t)FuncGblEntries.size() &&
"Unexpected device id!");
FuncGblEntries[DeviceId].emplace_back();
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
// KernelArgPoolMap.clear();
E.Entries.clear();
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
}
hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
unsigned int DeviceId) {
assert(DeviceId < DeviceFineGrainedMemoryPools.size() &&
"Unexpected device id!");
uint32_t GlobalFlags = 0;
hsa_status_t Err = hsa_amd_memory_pool_get_info(
MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
if (Err != HSA_STATUS_SUCCESS) {
return Err;
}
if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
} else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
}
return HSA_STATUS_SUCCESS;
}
hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
if (ValidStatus != HSA_STATUS_SUCCESS) {
DP("Alloc allowed in memory pool check failed: %s\n",
get_error_string(ValidStatus));
return HSA_STATUS_SUCCESS;
}
return addDeviceMemoryPool(MemoryPool, DeviceId);
});
if (Err != HSA_STATUS_SUCCESS) {
DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
"Iterate all memory pools", get_error_string(Err));
return Err;
}
}
return HSA_STATUS_SUCCESS;
}
hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
std::vector<hsa_amd_memory_pool_t> HostPools;
// collect all the "valid" pools for all the given agents.
for (const auto &Agent : Agents) {
hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
if (Err != HSA_STATUS_SUCCESS) {
DP("addMemoryPool returned %s, continuing\n", get_error_string(Err));
}
}
// We need two fine-grained pools.
// 1. One with kernarg flag set for storing kernel arguments
// 2. Second for host allocations
bool FineGrainedMemoryPoolSet = false;
bool KernArgPoolSet = false;
for (const auto &MemoryPool : HostPools) {
hsa_status_t Err = HSA_STATUS_SUCCESS;
uint32_t GlobalFlags = 0;
Err = hsa_amd_memory_pool_get_info(
MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
if (Err != HSA_STATUS_SUCCESS) {
DP("Get memory pool info failed: %s\n", get_error_string(Err));
return Err;
}
if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
KernArgPool = MemoryPool;
KernArgPoolSet = true;
}
HostFineGrainedMemoryPool = MemoryPool;
FineGrainedMemoryPoolSet = true;
}
}
if (FineGrainedMemoryPoolSet && KernArgPoolSet)
return HSA_STATUS_SUCCESS;
return HSA_STATUS_ERROR;
}
hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
assert(DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
"Invalid device Id");
return DeviceCoarseGrainedMemoryPools[DeviceId];
}
hsa_amd_memory_pool_t getHostMemoryPool() {
return HostFineGrainedMemoryPool;
}
static int readEnv(const char *Env, int Default = -1) {
const char *EnvStr = getenv(Env);
int Res = Default;
if (EnvStr) {
Res = std::stoi(EnvStr);
DP("Parsed %s=%d\n", Env, Res);
}
return Res;
}
RTLDeviceInfoTy() {
DP("Start initializing " GETNAME(TARGET_NAME) "\n");
// LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr at any
// time; a debug build of the library is not required.
// 0 => no tracing
// 1 => tracing dispatch only
// >1 => increased verbosity
if (!HSAInitSuccess()) {
DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
return;
}
if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
print_kernel_trace = atoi(EnvStr);
else
print_kernel_trace = 0;
hsa_status_t Err = core::atl_init_gpu_context();
if (Err != HSA_STATUS_SUCCESS) {
DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
return;
}
// Init hostcall soon after initializing hsa
hostrpc_init();
Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
if (DeviceType == HSA_DEVICE_TYPE_CPU) {
CPUAgents.push_back(Agent);
} else {
HSAAgents.push_back(Agent);
}
});
if (Err != HSA_STATUS_SUCCESS)
return;
NumberOfDevices = (int)HSAAgents.size();
if (NumberOfDevices == 0) {
DP("There are no devices supporting HSA.\n");
return;
}
DP("There are %d devices supporting HSA.\n", NumberOfDevices);
// Init the device info
HSAQueueSchedulers.reserve(NumberOfDevices);
FuncGblEntries.resize(NumberOfDevices);
ThreadsPerGroup.resize(NumberOfDevices);
ComputeUnits.resize(NumberOfDevices);
GPUName.resize(NumberOfDevices);
GroupsPerDevice.resize(NumberOfDevices);
WarpSize.resize(NumberOfDevices);
NumTeams.resize(NumberOfDevices);
NumThreads.resize(NumberOfDevices);
DeviceStateStore.resize(NumberOfDevices);
KernelInfoTable.resize(NumberOfDevices);
SymbolInfoTable.resize(NumberOfDevices);
DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
Err = setupDevicePools(HSAAgents);
if (Err != HSA_STATUS_SUCCESS) {
DP("Setup for Device Memory Pools failed\n");
return;
}
Err = setupHostMemoryPools(CPUAgents);
if (Err != HSA_STATUS_SUCCESS) {
DP("Setup for Host Memory Pools failed\n");
return;
}
for (int I = 0; I < NumberOfDevices; I++) {
uint32_t QueueSize = 0;
{
hsa_status_t Err = hsa_agent_get_info(
HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
if (Err != HSA_STATUS_SUCCESS) {
DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I);
return;
}
enum { MaxQueueSize = 4096 };
if (QueueSize > MaxQueueSize) {
QueueSize = MaxQueueSize;
}
}
{
HSAQueueScheduler QSched;
if (!QSched.createQueues(HSAAgents[I], QueueSize))
return;
HSAQueueSchedulers.emplace_back(std::move(QSched));
}
DeviceStateStore[I] = {nullptr, 0};
}
for (int I = 0; I < NumberOfDevices; I++) {
ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
ComputeUnits[I] = 1;
DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,
GroupsPerDevice[I], ThreadsPerGroup[I]);
}
// Get environment variables regarding teams
Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
Env.NumTeams = readEnv("OMP_NUM_TEAMS");
Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
// Default state.
RequiresFlags = OMP_REQ_UNDEFINED;
ConstructionSucceeded = true;
}
~RTLDeviceInfoTy() {
DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
if (!HSAInitSuccess()) {
// Then none of these can have been set up and they can't be torn down
return;
}
// Run destructors on types that use HSA before
// impl_finalize removes access to it
DeviceStateStore.clear();
KernelArgPoolMap.clear();
// Terminate hostrpc before finalizing hsa
hostrpc_terminate();
hsa_status_t Err;
for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
Err = hsa_executable_destroy(HSAExecutables[I]);
if (Err != HSA_STATUS_SUCCESS) {
DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
"Destroying executable", get_error_string(Err));
}
}
}
};
pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
static RTLDeviceInfoTy DeviceInfo;
namespace {
int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
__tgt_async_info *AsyncInfo) {
assert(AsyncInfo && "AsyncInfo is nullptr");
assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
// Return success if we are not copying back to host from target.
if (!HstPtr)
return OFFLOAD_SUCCESS;
hsa_status_t Err;
DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
(long long unsigned)(Elf64_Addr)TgtPtr,
(long long unsigned)(Elf64_Addr)HstPtr);
Err = DeviceInfo.freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
DeviceId);
if (Err != HSA_STATUS_SUCCESS) {
DP("Error when copying data from device to host. Pointers: "
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
(Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
return OFFLOAD_FAIL;
}
DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
(long long unsigned)(Elf64_Addr)TgtPtr,
(long long unsigned)(Elf64_Addr)HstPtr);
return OFFLOAD_SUCCESS;
}
int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
__tgt_async_info *AsyncInfo) {
assert(AsyncInfo && "AsyncInfo is nullptr");
hsa_status_t Err;
assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
// Return success if we are not doing host to target.
if (!HstPtr)
return OFFLOAD_SUCCESS;
DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
(long long unsigned)(Elf64_Addr)HstPtr,
(long long unsigned)(Elf64_Addr)TgtPtr);
Err = DeviceInfo.freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
DeviceId);
if (Err != HSA_STATUS_SUCCESS) {
DP("Error when copying data from host to device. Pointers: "
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
(Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
// Async.
// The implementation was written with cuda streams in mind. The semantics of
// that are to execute kernels on a queue in order of insertion. A synchronise
// call then makes writes visible between host and device. This means a series
// of N data_submit_async calls are expected to execute serially. HSA offers
// various options to run the data copies concurrently. This may require changes
// to libomptarget.
// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
// there are no outstanding kernels that need to be synchronized. Any async call
// may be passed a Queue==0, at which point the cuda implementation will set it
// to non-null (see getStream). The cuda streams are per-device. Upstream may
// change this interface to explicitly initialize the AsyncInfo pointer, but
// until then hsa lazily initializes it as well.
void initAsyncInfo(__tgt_async_info *AsyncInfo) {
// Set non-null while using async calls; return to null to indicate completion.
assert(AsyncInfo);
if (!AsyncInfo->Queue) {
AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
}
}
void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
assert(AsyncInfo);
assert(AsyncInfo->Queue);
AsyncInfo->Queue = 0;
}
// Determine launch values for kernel.
struct LaunchVals {
int WorkgroupSize;
int GridSize;
};
LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
int ConstWGSize,
llvm::omp::OMPTgtExecModeFlags ExecutionMode,
int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
int DeviceNumTeams) {
int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
int NumGroups = 0;
int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
MaxTeams = RTLDeviceInfoTy::HardTeamLimit;
if (print_kernel_trace & STARTUP_DETAILS) {
DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams);
DP("Max_Teams: %d\n", MaxTeams);
DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize);
DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
RTLDeviceInfoTy::DefaultWgSize);
DP("thread_limit: %d\n", ThreadLimit);
DP("threadsPerGroup: %d\n", ThreadsPerGroup);
DP("ConstWGSize: %d\n", ConstWGSize);
}
// check for thread_limit() clause
if (ThreadLimit > 0) {
ThreadsPerGroup = ThreadLimit;
DP("Setting threads per block to requested %d\n", ThreadLimit);
// Add master warp for GENERIC
if (ExecutionMode ==
llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
ThreadsPerGroup += WarpSize;
DP("Adding master wavefront: +%d threads\n", WarpSize);
}
if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
DP("Setting threads per block to maximum %d\n", ThreadsPerGroup);
}
}
// check flat_max_work_group_size attr here
if (ThreadsPerGroup > ConstWGSize) {
ThreadsPerGroup = ConstWGSize;
DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
ThreadsPerGroup);
}
if (print_kernel_trace & STARTUP_DETAILS)
DP("threadsPerGroup: %d\n", ThreadsPerGroup);
DP("Preparing %d threads\n", ThreadsPerGroup);
// Set default num_groups (teams)
if (Env.TeamLimit > 0)
NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
else
NumGroups = MaxTeams;
DP("Set default num of groups %d\n", NumGroups);
if (print_kernel_trace & STARTUP_DETAILS) {
DP("num_groups: %d\n", NumGroups);
DP("num_teams: %d\n", NumTeams);
}
// Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size.
// This reduction is typical for the default case (no thread_limit clause) or
// when the user requests an excessive number of teams via num_teams.
// FIXME: We can't distinguish between a constant and a variable thread limit,
// so we only handle constant thread_limits.
if (ThreadsPerGroup >
RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
// Should we round threadsPerGroup up to nearest WarpSize
// here?
NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;
// check for num_teams() clause
if (NumTeams > 0) {
NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
}
if (print_kernel_trace & STARTUP_DETAILS) {
DP("num_groups: %d\n", NumGroups);
DP("Env.NumTeams %d\n", Env.NumTeams);
DP("Env.TeamLimit %d\n", Env.TeamLimit);
}
if (Env.NumTeams > 0) {
NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
} else if (Env.TeamLimit > 0) {
NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
} else {
if (NumTeams <= 0) {
if (LoopTripcount > 0) {
if (ExecutionMode ==
llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
// round up to the nearest integer
NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
} else if (ExecutionMode ==
llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
NumGroups = LoopTripcount;
} else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
// This is a generic kernel that was transformed to use SPMD-mode
// execution but uses Generic-mode semantics for scheduling.
NumGroups = LoopTripcount;
}
DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
"threads per block %d\n",
NumGroups, LoopTripcount, ThreadsPerGroup);
}
} else {
NumGroups = NumTeams;
}
if (NumGroups > MaxTeams) {
NumGroups = MaxTeams;
if (print_kernel_trace & STARTUP_DETAILS)
DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
}
if (NumGroups > NumTeams && NumTeams > 0) {
NumGroups = NumTeams;
if (print_kernel_trace & STARTUP_DETAILS)
DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
NumTeams);
}
}
// The num_teams clause is always honored, unless DEFAULT is active.
if (NumTeams > 0) {
NumGroups = NumTeams;
// Cap num_groups to EnvMaxTeamsDefault if set.
if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
NumGroups = Env.MaxTeamsDefault;
}
if (print_kernel_trace & STARTUP_DETAILS) {
DP("threadsPerGroup: %d\n", ThreadsPerGroup);
DP("num_groups: %d\n", NumGroups);
DP("loop_tripcount: %ld\n", LoopTripcount);
}
DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,
ThreadsPerGroup);
LaunchVals Res;
Res.WorkgroupSize = ThreadsPerGroup;
Res.GridSize = ThreadsPerGroup * NumGroups;
return Res;
}
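// Reserve the next packet slot on the queue, busy-waiting until the write
// index is within the queue size of the read index (i.e. there is space).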
static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
bool Full = true;
while (Full) {
Full =
PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
}
return PacketId;
}
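// Launch one kernel: build an AQL kernel dispatch packet, copy the explicit
// arguments into a kernarg segment from the per-kernel pool, publish the
// packet and block on its completion signal.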
int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
int32_t ThreadLimit, uint64_t LoopTripcount) {
// Set the context we are using
// update thread limit content in gpu memory if un-initialized or specified
// from host
DP("Run target team region thread_limit %d\n", ThreadLimit);
// All args are references.
std::vector<void *> Args(ArgNum);
std::vector<void *> Ptrs(ArgNum);
DP("Arg_num: %d\n", ArgNum);
for (int32_t I = 0; I < ArgNum; ++I) {
Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
Args[I] = &Ptrs[I];
DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I]));
}
KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
std::string KernelName = std::string(KernelInfo->Name);
auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
if (KernelInfoTable[DeviceId].find(KernelName) ==
KernelInfoTable[DeviceId].end()) {
DP("Kernel %s not found\n", KernelName.c_str());
return OFFLOAD_FAIL;
}
const atl_kernel_info_t KernelInfoEntry =
KernelInfoTable[DeviceId][KernelName];
const uint32_t GroupSegmentSize =
KernelInfoEntry.group_segment_size + DeviceInfo.Env.DynamicMemSize;
const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
/*
* Set limit based on ThreadsPerGroup and GroupsPerDevice
*/
LaunchVals LV =
getLaunchVals(DeviceInfo.WarpSize[DeviceId], DeviceInfo.Env,
KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
NumTeams, // From run_region arg
ThreadLimit, // From run_region arg
LoopTripcount, // From run_region arg
DeviceInfo.NumTeams[KernelInfo->DeviceId]);
const int GridSize = LV.GridSize;
const int WorkgroupSize = LV.WorkgroupSize;
if (print_kernel_trace >= LAUNCH) {
int NumGroups = GridSize / WorkgroupSize;
// enum modes are SPMD, GENERIC, NONE 0,1,2
// if doing rtl timing, print to stderr, unless stdout requested.
bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
fprintf(TraceToStdout ? stdout : stderr,
"DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
"reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
"sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
VgprSpillCount, LoopTripcount, KernelInfo->Name);
}
// Run on the device.
{
hsa_queue_t *Queue = DeviceInfo.HSAQueueSchedulers[DeviceId].next();
if (!Queue) {
return OFFLOAD_FAIL;
}
uint64_t PacketId = acquireAvailablePacketId(Queue);
const uint32_t Mask = Queue->size - 1; // size is a power of 2
hsa_kernel_dispatch_packet_t *Packet =
(hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
// packet->header is written last
Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
Packet->workgroup_size_x = WorkgroupSize;
Packet->workgroup_size_y = 1;
Packet->workgroup_size_z = 1;
Packet->reserved0 = 0;
Packet->grid_size_x = GridSize;
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
Packet->private_segment_size = KernelInfoEntry.private_segment_size;
Packet->group_segment_size = GroupSegmentSize;
Packet->kernel_object = KernelInfoEntry.kernel_object;
Packet->kernarg_address = 0; // use the block allocator
Packet->reserved2 = 0; // impl writes id_ here
Packet->completion_signal = {0}; // may want a pool of signals
KernelArgPool *ArgPool = nullptr;
void *KernArg = nullptr;
{
auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
if (It != KernelArgPoolMap.end()) {
ArgPool = (It->second).get();
}
}
if (!ArgPool) {
DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
DeviceId);
}
{
if (ArgPool) {
assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
KernArg = ArgPool->allocate(ArgNum);
}
if (!KernArg) {
DP("Allocate kernarg failed\n");
return OFFLOAD_FAIL;
}
// Copy explicit arguments
for (int I = 0; I < ArgNum; I++) {
memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
}
// Initialize implicit arguments. TODO: Which of these can be dropped
impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>(
static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
memset(ImplArgs, 0,
sizeof(impl_implicit_args_t)); // may not be necessary
ImplArgs->offset_x = 0;
ImplArgs->offset_y = 0;
ImplArgs->offset_z = 0;
// assign a hostcall buffer for the selected Q
if (__atomic_load_n(&DeviceInfo.HostcallRequired, __ATOMIC_ACQUIRE)) {
// hostrpc_assign_buffer is not thread safe, and this function is
// under a multiple reader lock, not a writer lock.
static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&HostcallInitLock);
uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo.HSAAgents[DeviceId],
Queue, DeviceId);
pthread_mutex_unlock(&HostcallInitLock);
if (!Buffer) {
DP("hostrpc_assign_buffer failed, gpu would dereference null and "
"error\n");
return OFFLOAD_FAIL;
}
DP("Implicit argument count: %d\n",
KernelInfoEntry.implicit_argument_count);
if (KernelInfoEntry.implicit_argument_count >= 4) {
// Initialise pointer for implicit_argument_count != 0 ABI
// Guess that the right implicit argument is at offset 24 after
// the explicit arguments. In the future, should be able to read
// the offset from msgpack. Clang is not annotating it at present.
uint64_t Offset =
sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "
"args: %d\n",
Offset + 8, ArgPool->kernargSizeIncludingImplicit());
} else {
memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
}
}
// initialise pointer for implicit_argument_count == 0 ABI
ImplArgs->hostcall_ptr = Buffer;
}
Packet->kernarg_address = KernArg;
}
hsa_signal_t S = DeviceInfo.FreeSignalPool.pop();
if (S.handle == 0) {
DP("Failed to get signal instance\n");
return OFFLOAD_FAIL;
}
Packet->completion_signal = S;
hsa_signal_store_relaxed(Packet->completion_signal, 1);
// Publish the packet indicating it is ready to be processed
core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
core::createHeader(), Packet->setup);
// Since the packet is already published, its contents must not be
// accessed any more
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
HSA_WAIT_STATE_BLOCKED) != 0)
;
assert(ArgPool);
ArgPool->deallocate(KernArg);
DeviceInfo.FreeSignalPool.push(S);
}
DP("Kernel completed\n");
return OFFLOAD_SUCCESS;
}
bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
int32_t R = elf_check_machine(Image, AmdgcnMachineID);
if (!R) {
DP("Supported machine ID not found\n");
}
return R;
}
uint32_t elfEFlags(__tgt_device_image *Image) {
char *ImgBegin = (char *)Image->ImageStart;
size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;
Elf *E = elf_memory(ImgBegin, ImgSize);
if (!E) {
DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
return 0;
}
Elf64_Ehdr *Eh64 = elf64_getehdr(E);
if (!Eh64) {
DP("Unable to get machine ID from ELF file!\n");
elf_end(E);
return 0;
}
uint32_t Flags = Eh64->e_flags;
elf_end(E);
DP("ELF Flags: 0x%x\n", Flags);
return Flags;
}
template <typename T> bool enforceUpperBound(T *Value, T Upper) {
bool Changed = *Value > Upper;
if (Changed) {
*Value = Upper;
}
return Changed;
}
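// Return the single SHT_HASH section header of the image, or nullptr if there
// is none or more than one.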
Elf64_Shdr *findOnlyShtHash(Elf *Elf) {
size_t N;
int Rc = elf_getshdrnum(Elf, &N);
if (Rc != 0) {
return nullptr;
}
Elf64_Shdr *Result = nullptr;
for (size_t I = 0; I < N; I++) {
Elf_Scn *Scn = elf_getscn(Elf, I);
if (Scn) {
Elf64_Shdr *Shdr = elf64_getshdr(Scn);
if (Shdr) {
if (Shdr->sh_type == SHT_HASH) {
if (Result == nullptr) {
Result = Shdr;
} else {
// multiple SHT_HASH sections not handled
return nullptr;
}
}
}
}
}
return Result;
}
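// Look up a symbol by name via the ELF hash table, without loading the image.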
const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash,
const char *Symname) {
assert(SectionHash);
size_t SectionSymtabIndex = SectionHash->sh_link;
Elf64_Shdr *SectionSymtab =
elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex));
size_t SectionStrtabIndex = SectionSymtab->sh_link;
const Elf64_Sym *Symtab =
reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset);
const uint32_t *Hashtab =
reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset);
// Layout:
// nbucket
// nchain
// bucket[nbucket]
// chain[nchain]
uint32_t Nbucket = Hashtab[0];
const uint32_t *Bucket = &Hashtab[2];
const uint32_t *Chain = &Hashtab[Nbucket + 2];
const size_t Max = strlen(Symname) + 1;
const uint32_t Hash = elf_hash(Symname);
for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) {
char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name);
if (strncmp(Symname, N, Max) == 0) {
return &Symtab[I];
}
}
return nullptr;
}
struct SymbolInfo {
void *Addr = nullptr;
uint32_t Size = UINT32_MAX;
uint32_t ShType = SHT_NULL;
};
int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname,
SymbolInfo *Res) {
if (elf_kind(Elf) != ELF_K_ELF) {
return 1;
}
Elf64_Shdr *SectionHash = findOnlyShtHash(Elf);
if (!SectionHash) {
return 1;
}
const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname);
if (!Sym) {
return 1;
}
if (Sym->st_size > UINT32_MAX) {
return 1;
}
if (Sym->st_shndx == SHN_UNDEF) {
return 1;
}
Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx);
if (!Section) {
return 1;
}
Elf64_Shdr *Header = elf64_getshdr(Section);
if (!Header) {
return 1;
}
Res->Addr = Sym->st_value + Base;
Res->Size = static_cast<uint32_t>(Sym->st_size);
Res->ShType = Header->sh_type;
return 0;
}
int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname,
SymbolInfo *Res) {
Elf *Elf = elf_memory(Base, ImgSize);
if (Elf) {
int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res);
elf_end(Elf);
return Rc;
}
return 1;
}
hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
const char *SymName, void **VarAddr,
uint32_t *VarSize) {
SymbolInfo SI;
int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
if (Rc == 0) {
*VarAddr = SI.Addr;
*VarSize = SI.Size;
return HSA_STATUS_SUCCESS;
}
return HSA_STATUS_ERROR;
}
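// Adapt a C++ callable to the C callback expected by
// core::RegisterModuleFromMemory and register the image with the device's
// agent.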
template <typename C>
hsa_status_t moduleRegisterFromMemoryToPlace(
std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
std::vector<hsa_executable_t> &HSAExecutables) {
auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
C *Unwrapped = static_cast<C *>(CbState);
return (*Unwrapped)(Data, Size);
};
return core::RegisterModuleFromMemory(
KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
HSAExecutables);
}
uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
uint64_t DeviceStateBytes = 0;
{
// If this is the deviceRTL, get the state variable size
SymbolInfo SizeSi;
int Rc = getSymbolInfoWithoutLoading(
ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
if (Rc == 0) {
if (SizeSi.Size != sizeof(uint64_t)) {
DP("Found device_State_size variable with wrong size\n");
return 0;
}
// Read number of bytes directly from the elf
memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
}
}
return DeviceStateBytes;
}
struct DeviceEnvironment {
// Initialise a DeviceEnvironmentTy in the deviceRTL.
// Patches around differences in the deviceRTL between trunk, aomp and
// rocmcc. Over time these differences will tend to zero and this class can
// be simplified.
// The symbol may be in .data or .bss, and may be missing fields. TODO:
// review aomp/trunk/rocm and simplify the following.
// The symbol may also have been deadstripped because the device side
// accessors were unused.
// If the symbol is in .data (aomp, rocm) it can be written directly.
// If it is in .bss, we must wait for it to be allocated space on the
// gpu (trunk) and initialize it after loading.
const char *sym() { return "omptarget_device_environment"; }
DeviceEnvironmentTy HostDeviceEnv;
SymbolInfo SI;
bool Valid = false;
__tgt_device_image *Image;
const size_t ImgSize;
DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
__tgt_device_image *Image, const size_t ImgSize)
: Image(Image), ImgSize(ImgSize) {
HostDeviceEnv.NumDevices = NumberDevices;
HostDeviceEnv.DeviceNum = DeviceId;
HostDeviceEnv.DebugKind = 0;
HostDeviceEnv.DynamicMemSize = DynamicMemSize;
if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
HostDeviceEnv.DebugKind = std::stoi(EnvStr);
int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
sym(), &SI);
if (Rc != 0) {
DP("Finding global device environment '%s' - symbol missing.\n", sym());
return;
}
if (SI.Size > sizeof(HostDeviceEnv)) {
DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,
sizeof(HostDeviceEnv));
return;
}
Valid = true;
}
bool inImage() { return SI.ShType != SHT_NOBITS; }
hsa_status_t beforeLoading(void *Data, size_t Size) {
if (Valid) {
if (inImage()) {
DP("Setting global device environment before load (%u bytes)\n",
SI.Size);
uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart;
void *Pos = (char *)Data + Offset;
memcpy(Pos, &HostDeviceEnv, SI.Size);
}
}
return HSA_STATUS_SUCCESS;
}
hsa_status_t afterLoading() {
if (Valid) {
if (!inImage()) {
DP("Setting global device environment after load (%u bytes)\n",
SI.Size);
int DeviceId = HostDeviceEnv.DeviceNum;
auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId];
void *StatePtr;
uint32_t StatePtrSize;
hsa_status_t Err = interop_hsa_get_symbol_info(
SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
if (Err != HSA_STATUS_SUCCESS) {
DP("failed to find %s in loaded image\n", sym());
return Err;
}
if (StatePtrSize != SI.Size) {
DP("Symbol had size %u before loading, %u after\n", StatePtrSize,
SI.Size);
return HSA_STATUS_ERROR;
}
return DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
StatePtrSize, DeviceId);
}
}
return HSA_STATUS_SUCCESS;
}
};
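// Allocate zero-initialized memory from the device's coarse-grained pool. The
// size is rounded up to a multiple of four bytes because hsa_amd_memory_fill
// writes 32-bit words.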
hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
uint64_t Rounded = 4 * ((Size + 3) / 4);
void *Ptr;
hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId);
hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
if (Err != HSA_STATUS_SUCCESS) {
return Err;
}
hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
if (Rc != HSA_STATUS_SUCCESS) {
DP("zero fill device_state failed with %u\n", Rc);
core::Runtime::Memfree(Ptr);
return HSA_STATUS_ERROR;
}
*RetPtr = Ptr;
return HSA_STATUS_SUCCESS;
}
bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
SymbolInfo SI;
int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
return (Rc == 0) && (SI.Addr != nullptr);
}
} // namespace
namespace core {
hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(),
&DeviceInfo.HSAAgents[0], NULL, Ptr);
}
} // namespace core
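// Callback for hsa_agent_iterate_isas. Records the target ID (processor plus
// feature string) of each ISA whose triple is amdgcn-amd-amdhsa in
// DeviceInfo.TargetID.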
static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
hsa_status_t err;
uint32_t name_len;
err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len);
if (err != HSA_STATUS_SUCCESS) {
DP("Error getting ISA info length\n");
return err;
}
char TargetID[name_len];
err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID);
if (err != HSA_STATUS_SUCCESS) {
DP("Error getting ISA info name\n");
return err;
}
auto TripleTargetID = llvm::StringRef(TargetID);
if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
DeviceInfo.TargetID.push_back(TripleTargetID.ltrim('-').str());
}
return HSA_STATUS_SUCCESS;
}
/// Parse a TargetID to get processor arch and feature map.
/// Returns processor subarch.
/// Returns TargetID features in \p FeatureMap argument.
/// If the \p TargetID contains feature+, map that feature to true in
/// \p FeatureMap.
/// If the \p TargetID contains feature-, map that feature to false in
/// \p FeatureMap.
/// If the \p TargetID does not contain a feature (default), do not map it.
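/// For example, parseTargetID("gfx90a:sramecc+:xnack-", FeatureMap) returns
/// "gfx90a" and maps "sramecc" to true and "xnack" to false.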
StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) {
if (TargetID.empty())
return llvm::StringRef();
auto ArchFeature = TargetID.split(":");
auto Arch = ArchFeature.first;
auto Features = ArchFeature.second;
if (Features.empty())
return Arch;
if (Features.contains("sramecc+")) {
FeatureMap.insert(std::pair<std::string, bool>("sramecc", true));
} else if (Features.contains("sramecc-")) {
FeatureMap.insert(std::pair<std::string, bool>("sramecc", false));
}
if (Features.contains("xnack+")) {
FeatureMap.insert(std::pair<std::string, bool>("xnack", true));
} else if (Features.contains("xnack-")) {
FeatureMap.insert(std::pair<std::string, bool>("xnack", false));
}
return Arch;
}
/// Checks if an image \p ImgInfo is compatible with the current
/// system's environment \p EnvInfo.
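/// For example, an image built for "gfx90a" (all features ANY) is accepted by
/// an environment reporting "gfx90a:xnack+", while an image built for
/// "gfx90a:xnack+" is rejected by an environment reporting plain "gfx90a".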
bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo);
// Compatible in case of exact match
if (ImgTID == EnvTID) {
DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n",
ImgTID.data(), EnvTID.data());
return true;
}
// Incompatible if Archs mismatch.
StringMap<bool> ImgMap, EnvMap;
StringRef ImgArch = parseTargetID(ImgTID, ImgMap);
StringRef EnvArch = parseTargetID(EnvTID, EnvMap);
// Incompatible if either arch is empty or the image arch does not contain
// the environment arch.
if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) {
DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n",
ImgTID.data(), EnvTID.data());
return false;
}
// Incompatible if image has more features than the environment, irrespective
// of type or sign of features.
if (ImgMap.size() > EnvMap.size()) {
DP("Incompatible: Image has more features than the environment \t[Image: "
"%s]\t:\t[Environment: %s]\n",
ImgTID.data(), EnvTID.data());
return false;
}
// Compatible if each target feature specified by the environment is
// compatible with the corresponding target feature of the image. A target
// feature is compatible if the image does not specify it (meaning Any), or
// if it specifies it with the same value (meaning On or Off).
for (const auto &ImgFeature : ImgMap) {
auto EnvFeature = EnvMap.find(ImgFeature.first());
if (EnvFeature == EnvMap.end()) {
DP("Incompatible: Value of Image's non-ANY feature is not matching with "
"the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: "
"%s]\n",
ImgTID.data(), EnvTID.data());
return false;
} else if (EnvFeature->first() == ImgFeature.first() &&
EnvFeature->second != ImgFeature.second) {
DP("Incompatible: Value of Image's non-ANY feature is not matching with "
"the Environment feature's non-ANY value \t[Image: "
"%s]\t:\t[Environment: %s]\n",
ImgTID.data(), EnvTID.data());
return false;
}
}
// Image is compatible if all features of Environment are:
// - either, present in the Image's features map with the same sign,
// - or, the feature is missing from Image's features map i.e. it is
// set to ANY
DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: "
"%s]\n",
ImgTID.data(), EnvTID.data());
return true;
}
extern "C" {
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
return elfMachineIdIsAmdgcn(Image);
}
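// Checks the ELF machine ID and, when the image carries an explicit target
// architecture string, verifies that it is compatible with the target ID of
// every initialized device.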
int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
__tgt_image_info *info) {
if (!__tgt_rtl_is_valid_binary(image))
return false;
// A subarchitecture was not specified. Assume it is compatible.
if (!info->Arch)
return true;
int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
__tgt_rtl_init_device(DeviceId);
hsa_agent_t agent = DeviceInfo.HSAAgents[DeviceId];
hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
if (err != HSA_STATUS_SUCCESS) {
DP("Error iterating ISAs\n");
return false;
}
if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo.TargetID[DeviceId]))
return false;
}
DP("Image has Target ID compatible with the current environment: %s\n",
info->Arch);
return true;
}
int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; }
int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; }
int __tgt_rtl_number_of_devices() {
// If the construction failed, no methods are safe to call
if (DeviceInfo.ConstructionSucceeded) {
return DeviceInfo.NumberOfDevices;
}
DP("AMDGPU plugin construction failed. Zero devices available\n");
return 0;
}
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
DP("Init requires flags to %ld\n", RequiresFlags);
DeviceInfo.RequiresFlags = RequiresFlags;
return RequiresFlags;
}
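// Per-device initialization: query compute-unit count, workgroup/grid limits
// and wavefront size from the agent, then apply the team and thread caps
// taken from the environment.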
int32_t __tgt_rtl_init_device(int DeviceId) {
hsa_status_t Err = hsa_init();
if (Err != HSA_STATUS_SUCCESS) {
DP("HSA Initialization Failed.\n");
return HSA_STATUS_ERROR;
}
// This is per-device-id initialization.
DP("Initialize the device id: %d\n", DeviceId);
hsa_agent_t Agent = DeviceInfo.HSAAgents[DeviceId];
// Get the number of compute units.
uint32_t ComputeUnits = 0;
Err = hsa_agent_get_info(
Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
&ComputeUnits);
if (Err != HSA_STATUS_SUCCESS) {
DeviceInfo.ComputeUnits[DeviceId] = 1;
DP("Error getting compute units : settiing to 1\n");
} else {
DeviceInfo.ComputeUnits[DeviceId] = ComputeUnits;
DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[DeviceId]);
}
char GetInfoName[64]; // 64 is the max name size returned by get info
Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
(void *)GetInfoName);
if (Err)
DeviceInfo.GPUName[DeviceId] = "--unknown gpu--";
else {
DeviceInfo.GPUName[DeviceId] = GetInfoName;
}
if (print_kernel_trace & STARTUP_DETAILS)
DP("Device#%-2d CU's: %2d %s\n", DeviceId,
DeviceInfo.ComputeUnits[DeviceId], DeviceInfo.GPUName[DeviceId].c_str());
// Query attributes to determine number of threads/block and blocks/grid.
uint16_t WorkgroupMaxDim[3];
Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
&WorkgroupMaxDim);
if (Err != HSA_STATUS_SUCCESS) {
DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
DP("Error getting grid dims: num groups : %d\n",
RTLDeviceInfoTy::DefaultNumTeams);
} else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
DeviceInfo.GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
DP("Using %d ROCm blocks per grid\n", DeviceInfo.GroupsPerDevice[DeviceId]);
} else {
DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
"at the hard limit\n",
WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
}
// Get thread limit
hsa_dim3_t GridMaxDim;
Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
if (Err == HSA_STATUS_SUCCESS) {
DeviceInfo.ThreadsPerGroup[DeviceId] =
reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
DeviceInfo.GroupsPerDevice[DeviceId];
if (DeviceInfo.ThreadsPerGroup[DeviceId] == 0) {
DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
} else if (enforceUpperBound(&DeviceInfo.ThreadsPerGroup[DeviceId],
RTLDeviceInfoTy::MaxWgSize)) {
DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
} else {
DP("Using ROCm Queried thread limit: %d\n",
DeviceInfo.ThreadsPerGroup[DeviceId]);
}
} else {
DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
DP("Error getting max block dimension, use default:%d \n",
RTLDeviceInfoTy::MaxWgSize);
}
// Get wavefront size
uint32_t WavefrontSize = 0;
Err =
hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
if (Err == HSA_STATUS_SUCCESS) {
DP("Queried wavefront size: %d\n", WavefrontSize);
DeviceInfo.WarpSize[DeviceId] = WavefrontSize;
} else {
// TODO: Burn the wavefront size into the code object
DP("Warning: Unknown wavefront size, assuming 64\n");
DeviceInfo.WarpSize[DeviceId] = 64;
}
// Adjust teams to the env variables
if (DeviceInfo.Env.TeamLimit > 0 &&
(enforceUpperBound(&DeviceInfo.GroupsPerDevice[DeviceId],
DeviceInfo.Env.TeamLimit))) {
DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
DeviceInfo.Env.TeamLimit);
}
// Set default number of teams
if (DeviceInfo.Env.NumTeams > 0) {
DeviceInfo.NumTeams[DeviceId] = DeviceInfo.Env.NumTeams;
DP("Default number of teams set according to environment %d\n",
DeviceInfo.Env.NumTeams);
} else {
char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
int TeamsPerCU = DefaultTeamsPerCU;
if (TeamsPerCUEnvStr) {
TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
}
DeviceInfo.NumTeams[DeviceId] =
TeamsPerCU * DeviceInfo.ComputeUnits[DeviceId];
DP("Default number of teams = %d * number of compute units %d\n",
TeamsPerCU, DeviceInfo.ComputeUnits[DeviceId]);
}
if (enforceUpperBound(&DeviceInfo.NumTeams[DeviceId],
DeviceInfo.GroupsPerDevice[DeviceId])) {
DP("Default number of teams exceeds device limit, capping at %d\n",
DeviceInfo.GroupsPerDevice[DeviceId]);
}
// Adjust threads to the env variables
if (DeviceInfo.Env.TeamThreadLimit > 0 &&
(enforceUpperBound(&DeviceInfo.NumThreads[DeviceId],
DeviceInfo.Env.TeamThreadLimit))) {
DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
DeviceInfo.Env.TeamThreadLimit);
}
// Set default number of threads
DeviceInfo.NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
DP("Default number of threads set according to library's default %d\n",
RTLDeviceInfoTy::DefaultWgSize);
if (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId],
DeviceInfo.ThreadsPerGroup[DeviceId])) {
DP("Default number of threads exceeds device limit, capping at %d\n",
DeviceInfo.ThreadsPerGroup[DeviceId]);
}
DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
DeviceId, DeviceInfo.GroupsPerDevice[DeviceId],
DeviceInfo.ThreadsPerGroup[DeviceId]);
DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,
DeviceInfo.WarpSize[DeviceId], DeviceInfo.ThreadsPerGroup[DeviceId],
DeviceInfo.GroupsPerDevice[DeviceId],
DeviceInfo.GroupsPerDevice[DeviceId] *
DeviceInfo.ThreadsPerGroup[DeviceId]);
return OFFLOAD_SUCCESS;
}
static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
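// Image loading takes the load/run lock exclusively so that it cannot race
// with kernel launches, which take the same lock shared.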
__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
__tgt_device_image *Image) {
DeviceInfo.LoadRunLock.lock();
__tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
DeviceInfo.LoadRunLock.unlock();
return Res;
}
__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
__tgt_device_image *Image) {
// This function loads the device image onto gpu[DeviceId] and does other
// per-image initialization work. Specifically:
//
// - Initialize a DeviceEnvironmentTy instance embedded in the
// image at the symbol "omptarget_device_environment"
// Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
//
// - Allocate a large array per-gpu (could be moved to init_device)
// - Read a uint64_t at symbol omptarget_nvptx_device_State_size
// - Allocate at least that many bytes of gpu memory
// - Zero initialize it
// - Write the pointer to the symbol omptarget_nvptx_device_State
//
// - Pulls some per-kernel information together from various sources and
// records it in the KernelsList for quicker access later
//
// The initialization can be done before or after loading the image onto the
// gpu. This function presently does a mixture. Using the hsa api to get/set
// the information is simpler to implement, in exchange for more complicated
// runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
// back from the gpu vs a hashtable lookup on the host.
const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
DeviceInfo.clearOffloadEntriesTable(DeviceId);
// We do not need to set the ELF version because the caller of this function
// had to do that to decide the right runtime to use
if (!elfMachineIdIsAmdgcn(Image))
return NULL;
{
auto Env = DeviceEnvironment(DeviceId, DeviceInfo.NumberOfDevices,
DeviceInfo.Env.DynamicMemSize, Image, ImgSize);
auto &KernelInfo = DeviceInfo.KernelInfoTable[DeviceId];
auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId];
hsa_status_t Err = moduleRegisterFromMemoryToPlace(
KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
[&](void *Data, size_t Size) {
if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
__atomic_store_n(&DeviceInfo.HostcallRequired, true,
__ATOMIC_RELEASE);
}
return Env.beforeLoading(Data, Size);
},
DeviceInfo.HSAExecutables);
check("Module registering", Err);
if (Err != HSA_STATUS_SUCCESS) {
const char *DeviceName = DeviceInfo.GPUName[DeviceId].c_str();
const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
if (strcmp(DeviceName, ElfName) != 0) {
DP("Possible gpu arch mismatch: device:%s, image:%s please check"
" compiler flag: -march=<gpu>\n",
DeviceName, ElfName);
} else {
DP("Error loading image onto GPU: %s\n", get_error_string(Err));
}
return NULL;
}
Err = Env.afterLoading();
if (Err != HSA_STATUS_SUCCESS) {
return NULL;
}
}
DP("AMDGPU module successfully loaded!\n");
{
// The device_State array is either a large array in .bss or a void* that
// needs to be assigned to point to an array of device_state_bytes bytes.
// If absent, it has been deadstripped and needs no setup.
void *StatePtr;
uint32_t StatePtrSize;
auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId];
hsa_status_t Err = interop_hsa_get_symbol_info(
SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
&StatePtrSize);
if (Err != HSA_STATUS_SUCCESS) {
DP("No device_state symbol found, skipping initialization\n");
} else {
if (StatePtrSize < sizeof(void *)) {
DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,
sizeof(void *));
return NULL;
}
// If it's larger than a void*, assume it's a .bss array and no further
// initialization is required. Only try to set up a pointer when the size
// equals sizeof(void*).
if (StatePtrSize == sizeof(void *)) {
uint64_t DeviceStateBytes =
getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
if (DeviceStateBytes == 0) {
DP("Can't initialize device_State, missing size information\n");
return NULL;
}
auto &DSS = DeviceInfo.DeviceStateStore[DeviceId];
if (DSS.first.get() == nullptr) {
assert(DSS.second == 0);
void *Ptr = NULL;
hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
if (Err != HSA_STATUS_SUCCESS) {
DP("Failed to allocate device_state array\n");
return NULL;
}
DSS = {
std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
DeviceStateBytes,
};
}
void *Ptr = DSS.first.get();
if (DeviceStateBytes != DSS.second) {
DP("Inconsistent sizes of device_State unsupported\n");
return NULL;
}
// write ptr to device memory so it can be used by later kernels
Err = DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
DeviceId);
if (Err != HSA_STATUS_SUCCESS) {
DP("memcpy install of state_ptr failed\n");
return NULL;
}
}
}
}
// Here, we take advantage of the data that is appended after img_end to get
// the names of the symbols we need to load. This data consists of the host
// entries begin and end as well as the target name (see the offloading
// linker script creation in the clang compiler).
// Find the symbols in the module by name. The name can be obtained by
// concatenating the host entry name with the target name.
__tgt_offload_entry *HostBegin = Image->EntriesBegin;
__tgt_offload_entry *HostEnd = Image->EntriesEnd;
for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
if (!E->addr) {
// The host should always have something in the address to
// uniquely identify the target region.
DP("Analyzing host entry '<null>' (size = %lld)...\n",
(unsigned long long)E->size);
return NULL;
}
if (E->size) {
__tgt_offload_entry Entry = *E;
void *Varptr;
uint32_t Varsize;
auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId];
hsa_status_t Err = interop_hsa_get_symbol_info(
SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
if (Err != HSA_STATUS_SUCCESS) {
// Inform the user what symbol prevented offloading
DP("Loading global '%s' (Failed)\n", E->name);
return NULL;
}
if (Varsize != E->size) {
DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,
Varsize, E->size);
return NULL;
}
DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr));
Entry.addr = (void *)Varptr;
DeviceInfo.addOffloadEntry(DeviceId, Entry);
if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
E->flags & OMP_DECLARE_TARGET_LINK) {
// If unified memory is present any target link variables
// can access host addresses directly. There is no longer a
// need for device copies.
Err = DeviceInfo.freesignalpoolMemcpyH2D(Varptr, E->addr,
sizeof(void *), DeviceId);
if (Err != HSA_STATUS_SUCCESS)
DP("Error when copying USM\n");
DP("Copy linked variable host address (" DPxMOD ")"
"to device address (" DPxMOD ")\n",
DPxPTR(*((void **)E->addr)), DPxPTR(Varptr));
}
continue;
}
DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name));
// Errors in kernarg_segment_size were previously treated as 0 (or as undef).
uint32_t KernargSegmentSize = 0;
auto &KernelInfoMap = DeviceInfo.KernelInfoTable[DeviceId];
hsa_status_t Err = HSA_STATUS_SUCCESS;
if (!E->name) {
Err = HSA_STATUS_ERROR;
} else {
std::string KernelStr = std::string(E->name);
auto It = KernelInfoMap.find(KernelStr);
if (It != KernelInfoMap.end()) {
atl_kernel_info_t Info = It->second;
KernargSegmentSize = Info.kernel_segment_size;
} else {
Err = HSA_STATUS_ERROR;
}
}
// Default value GENERIC (in case the symbol is missing from the code object).
llvm::omp::OMPTgtExecModeFlags ExecModeVal =
llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
// Get flat group size if present, else DefaultWgSize.
int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
// Get the kernel descriptor if present.
// Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp.
struct KernDescValType {
uint16_t Version;
uint16_t TSize;
uint16_t WGSize;
};
struct KernDescValType KernDescVal;
std::string KernDescNameStr(E->name);
KernDescNameStr += "_kern_desc";
const char *KernDescName = KernDescNameStr.c_str();
void *KernDescPtr;
uint32_t KernDescSize;
void *CallStackAddr = nullptr;
Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
&KernDescPtr, &KernDescSize);
if (Err == HSA_STATUS_SUCCESS) {
if ((size_t)KernDescSize != sizeof(KernDescVal))
DP("Loading global computation properties '%s' - size mismatch (%u != "
"%lu)\n",
KernDescName, KernDescSize, sizeof(KernDescVal));
memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
// Check structure size against recorded size.
if ((size_t)KernDescSize != KernDescVal.TSize)
DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",
sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
DP("After loading global for %s KernDesc \n", KernDescName);
DP("KernDesc: Version: %d\n", KernDescVal.Version);
DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize);
if (KernDescVal.WGSize == 0) {
KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize);
}
WGSizeVal = KernDescVal.WGSize;
DP("WGSizeVal %d\n", WGSizeVal);
check("Loading KernDesc computation property", Err);
} else {
DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
// Flat group size
std::string WGSizeNameStr(E->name);
WGSizeNameStr += "_wg_size";
const char *WGSizeName = WGSizeNameStr.c_str();
void *WGSizePtr;
uint32_t WGSize;
Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
&WGSizePtr, &WGSize);
if (Err == HSA_STATUS_SUCCESS) {
if ((size_t)WGSize != sizeof(int16_t)) {
DP("Loading global computation properties '%s' - size mismatch (%u "
"!= "
"%lu)\n",
WGSizeName, WGSize, sizeof(int16_t));
return NULL;
}
memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
DP("Error wrong WGSize value specified in HSA code object file: "
"%d\n",
WGSizeVal);
WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
}
} else {
DP("Warning: Loading WGSize '%s' - symbol not found, "
"using default value %d\n",
WGSizeName, WGSizeVal);
}
check("Loading WGSize computation property", Err);
}
// Read execution mode from global in binary
std::string ExecModeNameStr(E->name);
ExecModeNameStr += "_exec_mode";
const char *ExecModeName = ExecModeNameStr.c_str();
void *ExecModePtr;
uint32_t VarSize;
Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
&ExecModePtr, &VarSize);
if (Err == HSA_STATUS_SUCCESS) {
if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
DP("Loading global computation properties '%s' - size mismatch(%u != "
"%lu)\n",
ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
return NULL;
}
memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);
DP("After loading global for %s ExecMode = %d\n", ExecModeName,
ExecModeVal);
if (ExecModeVal < 0 ||
ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
DP("Error wrong exec_mode value specified in HSA code object file: "
"%d\n",
ExecModeVal);
return NULL;
}
} else {
DP("Loading global exec_mode '%s' - symbol missing, using default "
"value "
"GENERIC (1)\n",
ExecModeName);
}
check("Loading computation property", Err);
KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
CallStackAddr, E->name, KernargSegmentSize,
DeviceInfo.KernArgPool));
__tgt_offload_entry Entry = *E;
Entry.addr = (void *)&KernelsList.back();
DeviceInfo.addOffloadEntry(DeviceId, Entry);
DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
}
return DeviceInfo.getOffloadEntriesTable(DeviceId);
}
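// Allocate Size bytes from the device's memory pool. Only
// TARGET_ALLOC_DEFAULT is supported; other allocation kinds are rejected.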
void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
void *Ptr = NULL;
assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
if (Kind != TARGET_ALLOC_DEFAULT) {
REPORT("Invalid target data allocation kind or requested allocator not "
"implemented yet\n");
return NULL;
}
hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId);
hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,
(long long unsigned)(Elf64_Addr)Ptr);
Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL;
return Ptr;
}
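// The synchronous copy entry points are built on the asynchronous paths:
// submit the transfer with a locally scoped __tgt_async_info and synchronize
// on it immediately.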
int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
int64_t Size) {
assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
__tgt_async_info AsyncInfo;
int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
if (Rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}
int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
int64_t Size, __tgt_async_info *AsyncInfo) {
assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
if (AsyncInfo) {
initAsyncInfo(AsyncInfo);
return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
}
return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
}
int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
int64_t Size) {
assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
__tgt_async_info AsyncInfo;
int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
if (Rc != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}
int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
int64_t Size,
__tgt_async_info *AsyncInfo) {
assert(AsyncInfo && "AsyncInfo is nullptr");
assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
initAsyncInfo(AsyncInfo);
return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
}
int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) {
assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
hsa_status_t Err;
DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
Err = core::Runtime::Memfree(TgtPtr);
if (Err != HSA_STATUS_SUCCESS) {
DP("Error when freeing CUDA memory\n");
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
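// Kernel launches take the load/run lock shared, so concurrent launches are
// allowed but are serialized against image loading.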
int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
void **TgtArgs, ptrdiff_t *TgtOffsets,
int32_t ArgNum, int32_t NumTeams,
int32_t ThreadLimit,
uint64_t LoopTripcount) {
DeviceInfo.LoadRunLock.lock_shared();
int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
ArgNum, NumTeams, ThreadLimit, LoopTripcount);
DeviceInfo.LoadRunLock.unlock_shared();
return Res;
}
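// A non-teams target region is launched as a single team with the default
// thread limit.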
int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
void **TgtArgs, ptrdiff_t *TgtOffsets,
int32_t ArgNum) {
// use one team and one thread
// fix thread num
int32_t TeamNum = 1;
int32_t ThreadLimit = 0; // use default
return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
TgtOffsets, ArgNum, TeamNum,
ThreadLimit, 0);
}
int32_t __tgt_rtl_run_target_team_region_async(
int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit,
uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) {
assert(AsyncInfo && "AsyncInfo is nullptr");
initAsyncInfo(AsyncInfo);
DeviceInfo.LoadRunLock.lock_shared();
int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
ArgNum, NumTeams, ThreadLimit, LoopTripcount);
DeviceInfo.LoadRunLock.unlock_shared();
return Res;
}
int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
void **TgtArgs, ptrdiff_t *TgtOffsets,
int32_t ArgNum,
__tgt_async_info *AsyncInfo) {
// use one team and one thread
// fix thread num
int32_t TeamNum = 1;
int32_t ThreadLimit = 0; // use default
return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs,
TgtOffsets, ArgNum, TeamNum,
ThreadLimit, 0, AsyncInfo);
}
int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
assert(AsyncInfo && "AsyncInfo is nullptr");
// Cuda asserts that AsyncInfo->Queue is non-null, but this invariant
// is not ensured by devices.cpp for amdgcn
// assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
if (AsyncInfo->Queue) {
finiAsyncInfo(AsyncInfo);
}
return OFFLOAD_SUCCESS;
}
void __tgt_rtl_print_device_info(int32_t DeviceId) {
// TODO: Assertion to see if DeviceId is correct
// NOTE: We don't need to set context for print device info.
DeviceInfo.printDeviceInfo(DeviceId, DeviceInfo.HSAAgents[DeviceId]);
}
} // extern "C"