| //===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // RTL for AMD HSA machines |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include <algorithm> |
| #include <assert.h> |
| #include <atomic> |
| #include <cinttypes> |
| #include <cstdio> |
| #include <cstdlib> |
| #include <cstring> |
| #include <functional> |
| #include <libelf.h> |
| #include <list> |
| #include <map> |
| #include <memory> |
| #include <mutex> |
| #include <queue> |
| #include <shared_mutex> |
| #include <string> |
| #include <unordered_map> |
| #include <vector> |
| |
| #include "impl_runtime.h" |
| #include "interop_hsa.h" |
| |
| #include "internal.h" |
| #include "rt.h" |
| |
| #include "DeviceEnvironment.h" |
| #include "get_elf_mach_gfx_name.h" |
| #include "omptargetplugin.h" |
| #include "print_tracing.h" |
| |
| #include "llvm/ADT/StringMap.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/Frontend/OpenMP/OMPConstants.h" |
| #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
| |
| using namespace llvm; |
| |
| // hostrpc interface. FIXME: consider moving this to its own include. These |
| // functions are statically linked into the amdgpu plugin from |
| // hostrpc_services.a when present, linked as --whole-archive to override the |
| // weak symbols that implement a fallback for toolchains that do not yet have |
| // a hostrpc library. |
| extern "C" { |
| uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ, |
| uint32_t DeviceId); |
| hsa_status_t hostrpc_init(); |
| hsa_status_t hostrpc_terminate(); |
| |
| __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } |
| __attribute__((weak)) hsa_status_t hostrpc_terminate() { |
| return HSA_STATUS_SUCCESS; |
| } |
| __attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, |
| uint32_t DeviceId) { |
| DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library " |
| "missing\n", |
| DeviceId); |
| return 0; |
| } |
| } |
| |
| // Heuristic parameters used for kernel launch |
| // Number of teams per CU to allow scheduling flexibility |
| static const unsigned DefaultTeamsPerCU = 4; |
| |
| int print_kernel_trace; |
| |
| #ifdef OMPTARGET_DEBUG |
| #define check(msg, status) \ |
| if (status != HSA_STATUS_SUCCESS) { \ |
| DP(#msg " failed\n"); \ |
| } else { \ |
| DP(#msg " succeeded\n"); \ |
| } |
| #else |
| #define check(msg, status) \ |
| {} |
| #endif |
| |
| #include "elf_common.h" |
| |
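| // C++ wrappers over the HSA C API's callback-style iterators. The callable |
| // is smuggled through the void *data parameter and unwrapped inside a |
| // captureless lambda, so capturing C++ lambdas can be used where the C API |
| // expects a plain function pointer, e.g.: |
| //   hsa::iterate_agents( |
| //       [&](hsa_agent_t A) { /* ... */ return HSA_STATUS_SUCCESS; }); |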
| namespace hsa { |
| template <typename C> hsa_status_t iterate_agents(C Cb) { |
| auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t { |
| C *Unwrapped = static_cast<C *>(Data); |
| return (*Unwrapped)(Agent); |
| }; |
| return hsa_iterate_agents(L, static_cast<void *>(&Cb)); |
| } |
| |
| template <typename C> |
| hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) { |
| auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t { |
| C *Unwrapped = static_cast<C *>(Data); |
| return (*Unwrapped)(MemoryPool); |
| }; |
| |
| return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb)); |
| } |
| |
| } // namespace hsa |
| |
| /// Keep entries table per device |
| struct FuncOrGblEntryTy { |
| __tgt_target_table Table; |
| std::vector<__tgt_offload_entry> Entries; |
| }; |
| |
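| /// Pre-allocated pool of kernarg segments for one kernel. A single region |
| /// sized for MAX_NUM_KERNELS concurrent launches is carved into fixed-size |
| /// slots; free slot indices are recycled through FreeKernargSegments. |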
| struct KernelArgPool { |
| private: |
| static pthread_mutex_t Mutex; |
| |
| public: |
| uint32_t KernargSegmentSize; |
| void *KernargRegion = nullptr; |
| std::queue<int> FreeKernargSegments; |
| |
| uint32_t kernargSizeIncludingImplicit() { |
| return KernargSegmentSize + sizeof(impl_implicit_args_t); |
| } |
| |
| ~KernelArgPool() { |
| if (KernargRegion) { |
| auto R = hsa_amd_memory_pool_free(KernargRegion); |
| if (R != HSA_STATUS_SUCCESS) { |
| DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R)); |
| } |
| } |
| } |
| |
| // Can't really copy or move a mutex |
| KernelArgPool() = default; |
| KernelArgPool(const KernelArgPool &) = delete; |
| KernelArgPool(KernelArgPool &&) = delete; |
| |
| KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool) |
| : KernargSegmentSize(KernargSegmentSize) { |
| |
| // impl uses one pool per kernel for all gpus, with a fixed upper size. |
| // That exact scheme is preserved here, including the queue<int>. |
| |
| hsa_status_t Err = hsa_amd_memory_pool_allocate( |
| MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0, |
| &KernargRegion); |
| |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err)); |
| KernargRegion = nullptr; // paranoid |
| return; |
| } |
| |
| Err = core::allow_access_to_all_gpu_agents(KernargRegion); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("hsa allow_access_to_all_gpu_agents failed: %s\n", |
| get_error_string(Err)); |
| auto R = hsa_amd_memory_pool_free(KernargRegion); |
| if (R != HSA_STATUS_SUCCESS) { |
| // if free failed, can't do anything more to resolve it |
| DP("hsa memory poll free failed: %s\n", get_error_string(Err)); |
| } |
| KernargRegion = nullptr; |
| return; |
| } |
| |
| for (int I = 0; I < MAX_NUM_KERNELS; I++) { |
| FreeKernargSegments.push(I); |
| } |
| } |
| |
| void *allocate(uint64_t ArgNum) { |
| assert((ArgNum * sizeof(void *)) == KernargSegmentSize); |
| Lock L(&Mutex); |
| void *Res = nullptr; |
| if (!FreeKernargSegments.empty()) { |
| |
| int FreeIdx = FreeKernargSegments.front(); |
| Res = static_cast<void *>(static_cast<char *>(KernargRegion) + |
| (FreeIdx * kernargSizeIncludingImplicit())); |
| assert(FreeIdx == pointerToIndex(Res)); |
| FreeKernargSegments.pop(); |
| } |
| return Res; |
| } |
| |
| void deallocate(void *Ptr) { |
| Lock L(&Mutex); |
| int Idx = pointerToIndex(Ptr); |
| FreeKernargSegments.push(Idx); |
| } |
| |
| private: |
| int pointerToIndex(void *Ptr) { |
| ptrdiff_t Bytes = |
| static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion); |
| assert(Bytes >= 0); |
| assert(Bytes % kernargSizeIncludingImplicit() == 0); |
| return Bytes / kernargSizeIncludingImplicit(); |
| } |
| struct Lock { |
| Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); } |
| ~Lock() { pthread_mutex_unlock(M); } |
| pthread_mutex_t *M; |
| }; |
| }; |
| pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER; |
| |
| std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>> |
| KernelArgPoolMap; |
| |
| /// Use a single entity to encode a kernel and a set of flags |
| struct KernelTy { |
| llvm::omp::OMPTgtExecModeFlags ExecutionMode; |
| int16_t ConstWGSize; |
| int32_t DeviceId; |
| void *CallStackAddr = nullptr; |
| const char *Name; |
| |
| KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, |
| int32_t DeviceId, void *CallStackAddr, const char *Name, |
| uint32_t KernargSegmentSize, |
| hsa_amd_memory_pool_t &KernArgMemoryPool) |
| : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), |
| DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { |
| DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); |
| |
| std::string N(Name); |
| if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { |
| KernelArgPoolMap.insert( |
| std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool( |
| KernargSegmentSize, KernArgMemoryPool)))); |
| } |
| } |
| }; |
| |
| /// List that contains all the kernels. |
| /// FIXME: we may need this to be per device and per library. |
| std::list<KernelTy> KernelsList; |
| |
| template <typename Callback> static hsa_status_t findAgents(Callback CB) { |
| |
| hsa_status_t Err = |
| hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t { |
| hsa_device_type_t DeviceType; |
| // get_info fails iff HSA runtime not yet initialized |
| hsa_status_t Err = |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); |
| |
| if (Err != HSA_STATUS_SUCCESS) { |
| if (print_kernel_trace > 0) |
| DP("rtl.cpp: err %s\n", get_error_string(Err)); |
| |
| return Err; |
| } |
| |
| CB(DeviceType, Agent); |
| return HSA_STATUS_SUCCESS; |
| }); |
| |
| // iterate_agents fails iff HSA runtime not yet initialized |
| if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) { |
| DP("rtl.cpp: err %s\n", get_error_string(Err)); |
| } |
| |
| return Err; |
| } |
| |
| static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source, |
| void *Data) { |
| if (Status != HSA_STATUS_SUCCESS) { |
| const char *StatusString; |
| if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) { |
| StatusString = "unavailable"; |
| } |
| DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source, |
| Status, StatusString); |
| abort(); |
| } |
| } |
| |
| namespace core { |
| namespace { |
| |
| bool checkResult(hsa_status_t Err, const char *ErrMsg) { |
| if (Err == HSA_STATUS_SUCCESS) |
| return true; |
| |
| REPORT("%s", ErrMsg); |
| REPORT("%s", get_error_string(Err)); |
| return false; |
| } |
| |
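| // Publish an AQL packet by writing its header and setup fields in a single |
| // atomic 32-bit release store, as the HSA packet protocol requires. |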
| void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) { |
| __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE); |
| } |
| |
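| // Build a kernel dispatch packet header with system-scope acquire/release |
| // fences so host writes are visible to the kernel and kernel writes are |
| // visible to the host on completion. |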
| uint16_t createHeader() { |
| uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; |
| Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
| Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
| return Header; |
| } |
| |
| hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) { |
| bool AllocAllowed = false; |
| hsa_status_t Err = hsa_amd_memory_pool_get_info( |
| MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, |
| &AllocAllowed); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Alloc allowed in memory pool check failed: %s\n", |
| get_error_string(Err)); |
| return Err; |
| } |
| |
| size_t Size = 0; |
| Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, |
| &Size); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Get memory pool size failed: %s\n", get_error_string(Err)); |
| return Err; |
| } |
| |
| return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; |
| } |
| |
| hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { |
| std::vector<hsa_amd_memory_pool_t> *Result = |
| static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data); |
| |
| hsa_status_t Err; |
| if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) { |
| return Err; |
| } |
| |
| Result->push_back(MemoryPool); |
| return HSA_STATUS_SUCCESS; |
| } |
| |
| } // namespace |
| } // namespace core |
| |
| struct EnvironmentVariables { |
| int NumTeams; |
| int TeamLimit; |
| int TeamThreadLimit; |
| int MaxTeamsDefault; |
| int DynamicMemSize; |
| }; |
| |
| template <uint32_t wavesize> |
| static constexpr const llvm::omp::GV &getGridValue() { |
| return llvm::omp::getAMDGPUGridValues<wavesize>(); |
| } |
| |
| struct HSALifetime { |
| // Wrapper around HSA used to ensure it is constructed before other types |
| // and destructed after, which means those other types can use RAII for |
| // cleanup without risking running outside of the lifetime of HSA. |
| const hsa_status_t S; |
| |
| bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; } |
| HSALifetime() : S(hsa_init()) {} |
| |
| ~HSALifetime() { |
| if (S == HSA_STATUS_SUCCESS) { |
| hsa_status_t Err = hsa_shut_down(); |
| if (Err != HSA_STATUS_SUCCESS) { |
| // Can't call into HSA to get a string from the integer |
| DP("Shutting down HSA failed: %d\n", Err); |
| } |
| } |
| } |
| }; |
| |
| // Handle scheduling of multiple hsa_queue's per device to |
| // multiple threads (one scheduler per device) |
| class HSAQueueScheduler { |
| public: |
| HSAQueueScheduler() : Current(0) {} |
| |
| HSAQueueScheduler(const HSAQueueScheduler &) = delete; |
| |
| HSAQueueScheduler(HSAQueueScheduler &&Q) { |
| Current = Q.Current.load(); |
| for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { |
| HSAQueues[I] = Q.HSAQueues[I]; |
| Q.HSAQueues[I] = nullptr; |
| } |
| } |
| |
| // \return false if any HSA queue creation fails |
| bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) { |
| for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { |
| hsa_queue_t *Q = nullptr; |
| hsa_status_t Rc = |
| hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI, |
| callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q); |
| if (Rc != HSA_STATUS_SUCCESS) { |
| DP("Failed to create HSA queue %d\n", I); |
| return false; |
| } |
| HSAQueues[I] = Q; |
| } |
| return true; |
| } |
| |
| ~HSAQueueScheduler() { |
| for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { |
| if (HSAQueues[I]) { |
| hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]); |
| if (Err != HSA_STATUS_SUCCESS) |
| DP("Error destroying HSA queue"); |
| } |
| } |
| } |
| |
| // \return next queue to use for device |
| hsa_queue_t *next() { |
| return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) % |
| NUM_QUEUES_PER_DEVICE]; |
| } |
| |
| private: |
| // Number of queues per device |
| enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 }; |
| hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {}; |
| std::atomic<uint8_t> Current; |
| }; |
| |
| /// Class containing all the device information |
| class RTLDeviceInfoTy : HSALifetime { |
| std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; |
| |
| struct QueueDeleter { |
| void operator()(hsa_queue_t *Q) { |
| if (Q) { |
| hsa_status_t Err = hsa_queue_destroy(Q); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Error destroying hsa queue: %s\n", get_error_string(Err)); |
| } |
| } |
| } |
| }; |
| |
| public: |
| bool ConstructionSucceeded = false; |
| |
| // load binary populates symbol tables and mutates various global state; |
| // run uses those symbol tables |
| std::shared_timed_mutex LoadRunLock; |
| |
| int NumberOfDevices = 0; |
| |
| // GPU devices |
| std::vector<hsa_agent_t> HSAAgents; |
| std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu |
| |
| // CPUs |
| std::vector<hsa_agent_t> CPUAgents; |
| |
| // Device properties |
| std::vector<int> ComputeUnits; |
| std::vector<int> GroupsPerDevice; |
| std::vector<int> ThreadsPerGroup; |
| std::vector<int> WarpSize; |
| std::vector<std::string> GPUName; |
| std::vector<std::string> TargetID; |
| |
| // OpenMP properties |
| std::vector<int> NumTeams; |
| std::vector<int> NumThreads; |
| |
| // OpenMP Environment properties |
| EnvironmentVariables Env; |
| |
| // OpenMP Requires Flags |
| int64_t RequiresFlags; |
| |
| // Resource pools |
| SignalPoolT FreeSignalPool; |
| |
| bool HostcallRequired = false; |
| |
| std::vector<hsa_executable_t> HSAExecutables; |
| |
| std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable; |
| std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable; |
| |
| hsa_amd_memory_pool_t KernArgPool; |
| |
| // fine grained memory pool for host allocations |
| hsa_amd_memory_pool_t HostFineGrainedMemoryPool; |
| |
| // fine and coarse-grained memory pools per offloading device |
| std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools; |
| std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools; |
| |
| struct ImplFreePtrDeletor { |
| void operator()(void *P) { |
| core::Runtime::Memfree(P); // ignore failure to free |
| } |
| }; |
| |
| // device_State shared across loaded binaries, error if inconsistent size |
| std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>> |
| DeviceStateStore; |
| |
| static const unsigned HardTeamLimit = |
| (1 << 16) - 1; // 65535, so the value fits in a uint16_t |
| static const int DefaultNumTeams = 128; |
| |
| // These need to be per-device since different devices can have different |
| // wave sizes, but are currently the same number for each so that refactor |
| // can be postponed. |
| static_assert(getGridValue<32>().GV_Max_Teams == |
| getGridValue<64>().GV_Max_Teams, |
| ""); |
| static const int MaxTeams = getGridValue<64>().GV_Max_Teams; |
| |
| static_assert(getGridValue<32>().GV_Max_WG_Size == |
| getGridValue<64>().GV_Max_WG_Size, |
| ""); |
| static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size; |
| |
| static_assert(getGridValue<32>().GV_Default_WG_Size == |
| getGridValue<64>().GV_Default_WG_Size, |
| ""); |
| static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size; |
| |
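| // Run a signal-based memcpy helper with a completion signal borrowed from |
| // the free signal pool; the signal is returned immediately afterwards, so |
| // the helper is expected to have waited for the copy before returning. |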
| using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size, |
| hsa_agent_t, hsa_amd_memory_pool_t); |
| hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size, |
| MemcpyFunc Func, int32_t DeviceId) { |
| hsa_agent_t Agent = HSAAgents[DeviceId]; |
| hsa_signal_t S = FreeSignalPool.pop(); |
| if (S.handle == 0) { |
| return HSA_STATUS_ERROR; |
| } |
| hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool); |
| FreeSignalPool.push(S); |
| return R; |
| } |
| |
| hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size, |
| int32_t DeviceId) { |
| return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId); |
| } |
| |
| hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size, |
| int32_t DeviceId) { |
| return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId); |
| } |
| |
| static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) { |
| char TmpChar[1000]; |
| uint16_t Major, Minor; |
| uint32_t TmpUInt; |
| uint32_t TmpUInt2; |
| uint32_t CacheSize[4]; |
| bool TmpBool; |
| uint16_t WorkgroupMaxDim[3]; |
| hsa_dim3_t GridMaxDim; |
| |
| // Getting basic information about HSA and Device |
| core::checkResult( |
| hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major), |
| "Error from hsa_system_get_info when obtaining " |
| "HSA_SYSTEM_INFO_VERSION_MAJOR\n"); |
| core::checkResult( |
| hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor), |
| "Error from hsa_system_get_info when obtaining " |
| "HSA_SYSTEM_INFO_VERSION_MINOR\n"); |
| printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor); |
| printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId); |
| core::checkResult( |
| hsa_agent_get_info( |
| Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n"); |
| printf(" Product Name: \t\t\t%s \n", TmpChar); |
| core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_NAME\n"); |
| printf(" Device Name: \t\t\t%s \n", TmpChar); |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_NAME\n"); |
| printf(" Vendor Name: \t\t\t%s \n", TmpChar); |
| hsa_device_type_t DevType; |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_DEVICE\n"); |
| printf(" Device Type: \t\t\t%s \n", |
| DevType == HSA_DEVICE_TYPE_CPU |
| ? "CPU" |
| : (DevType == HSA_DEVICE_TYPE_GPU |
| ? "GPU" |
| : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN"))); |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_QUEUES_MAX\n"); |
| printf(" Max Queues: \t\t\t%u \n", TmpUInt); |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); |
| printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); |
| printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); |
| |
| // Getting cache information |
| printf(" Cache:\n"); |
| |
| // FIXME: This is deprecated according to HSA documentation. But using |
| // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during |
| // runtime. |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_CACHE_SIZE\n"); |
| |
| for (unsigned I = 0; I < 4; I++) { |
| if (CacheSize[I]) { |
| printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); |
| } |
| } |
| |
| core::checkResult( |
| hsa_agent_get_info(Agent, |
| (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, |
| &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); |
| printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); |
| core::checkResult( |
| hsa_agent_get_info( |
| Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, |
| &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); |
| printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); |
| core::checkResult( |
| hsa_agent_get_info( |
| Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, |
| &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); |
| printf(" Compute Units: \t\t\t%u \n", TmpUInt); |
| core::checkResult(hsa_agent_get_info( |
| Agent, |
| (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, |
| &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); |
| printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); |
| printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE")); |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); |
| printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); |
| printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); |
| core::checkResult(hsa_agent_get_info(Agent, |
| HSA_AGENT_INFO_WORKGROUP_MAX_DIM, |
| WorkgroupMaxDim), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); |
| printf(" Workgroup Max Size per Dimension:\n"); |
| printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); |
| printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); |
| printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); |
| core::checkResult(hsa_agent_get_info( |
| Agent, |
| (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, |
| &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); |
| printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); |
| printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); |
| printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_GRID_MAX_DIM\n"); |
| printf(" Grid Max Size per Dimension: \t\t\n"); |
| printf(" x: \t\t\t\t%u\n", GridMaxDim.x); |
| printf(" y: \t\t\t\t%u\n", GridMaxDim.y); |
| printf(" z: \t\t\t\t%u\n", GridMaxDim.z); |
| core::checkResult( |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), |
| "Error returned from hsa_agent_get_info when obtaining " |
| "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); |
| printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); |
| |
| printf(" Memory Pools:\n"); |
| auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { |
| std::string TmpStr; |
| size_t Size; |
| bool Alloc, Access; |
| hsa_amd_segment_t Segment; |
| hsa_amd_memory_pool_global_flag_t GlobalFlags; |
| core::checkResult( |
| hsa_amd_memory_pool_get_info( |
| Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), |
| "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
| "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); |
| core::checkResult(hsa_amd_memory_pool_get_info( |
| Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), |
| "Error returned from hsa_amd_memory_pool_get_info when " |
| "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); |
| |
| switch (Segment) { |
| case HSA_AMD_SEGMENT_GLOBAL: |
| TmpStr = "GLOBAL; FLAGS: "; |
| if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) |
| TmpStr += "KERNARG, "; |
| if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) |
| TmpStr += "FINE GRAINED, "; |
| if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) |
| TmpStr += "COARSE GRAINED, "; |
| break; |
| case HSA_AMD_SEGMENT_READONLY: |
| TmpStr = "READONLY"; |
| break; |
| case HSA_AMD_SEGMENT_PRIVATE: |
| TmpStr = "PRIVATE"; |
| break; |
| case HSA_AMD_SEGMENT_GROUP: |
| TmpStr = "GROUP"; |
| break; |
| } |
| printf(" Pool %s: \n", TmpStr.c_str()); |
| |
| core::checkResult(hsa_amd_memory_pool_get_info( |
| Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size), |
| "Error returned from hsa_amd_memory_pool_get_info when " |
| "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n"); |
| printf(" Size: \t\t\t\t %zu bytes\n", Size); |
| core::checkResult( |
| hsa_amd_memory_pool_get_info( |
| Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc), |
| "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
| "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n"); |
| printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE")); |
| core::checkResult( |
| hsa_amd_memory_pool_get_info( |
| Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size), |
| "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
| "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n"); |
| printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size); |
| core::checkResult( |
| hsa_amd_memory_pool_get_info( |
| Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size), |
| "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
| "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n"); |
| printf(" Runtime Alloc alignment: \t %zu bytes\n", Size); |
| core::checkResult( |
| hsa_amd_memory_pool_get_info( |
| Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access), |
| "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
| "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n"); |
| printf(" Accessable by all: \t\t %s\n", |
| (Access ? "TRUE" : "FALSE")); |
| |
| return HSA_STATUS_SUCCESS; |
| }; |
| // Iterate over all the memory regions for this agent. Get the memory region |
| // type and size |
| hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr); |
| |
| printf(" ISAs:\n"); |
| auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t { |
| char TmpChar[1000]; |
| core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar), |
| "Error returned from hsa_isa_get_info_alt when " |
| "obtaining HSA_ISA_INFO_NAME\n"); |
| printf(" Name: \t\t\t\t %s\n", TmpChar); |
| |
| return HSA_STATUS_SUCCESS; |
| }; |
| // Iterate over all the ISAs for this agent and print their names |
| hsa_agent_iterate_isas(Agent, CBIsas, nullptr); |
| } |
| |
| // Record entry point associated with device |
| void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) { |
| assert(DeviceId < (int32_t)FuncGblEntries.size() && |
| "Unexpected device id!"); |
| FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
| |
| E.Entries.push_back(Entry); |
| } |
| |
| // Return true if the entry is associated with device |
| bool findOffloadEntry(int32_t DeviceId, void *Addr) { |
| assert(DeviceId < (int32_t)FuncGblEntries.size() && |
| "Unexpected device id!"); |
| FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
| |
| for (auto &It : E.Entries) { |
| if (It.addr == Addr) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| // Return the pointer to the target entries table |
| __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) { |
| assert(DeviceId < (int32_t)FuncGblEntries.size() && |
| "Unexpected device id!"); |
| FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
| |
| int32_t Size = E.Entries.size(); |
| |
| // Table is empty |
| if (!Size) |
| return 0; |
| |
| __tgt_offload_entry *Begin = &E.Entries[0]; |
| __tgt_offload_entry *End = &E.Entries[Size - 1]; |
| |
| // Update table info according to the entries and return the pointer |
| E.Table.EntriesBegin = Begin; |
| E.Table.EntriesEnd = ++End; |
| |
| return &E.Table; |
| } |
| |
| // Clear entries table for a device |
| void clearOffloadEntriesTable(int DeviceId) { |
| assert(DeviceId < (int32_t)FuncGblEntries.size() && |
| "Unexpected device id!"); |
| FuncGblEntries[DeviceId].emplace_back(); |
| FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
| // KernelArgPoolMap.clear(); |
| E.Entries.clear(); |
| E.Table.EntriesBegin = E.Table.EntriesEnd = 0; |
| } |
| |
| hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool, |
| unsigned int DeviceId) { |
| assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here."); |
| uint32_t GlobalFlags = 0; |
| hsa_status_t Err = hsa_amd_memory_pool_get_info( |
| MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); |
| |
| if (Err != HSA_STATUS_SUCCESS) { |
| return Err; |
| } |
| |
| if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { |
| DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool; |
| } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { |
| DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool; |
| } |
| |
| return HSA_STATUS_SUCCESS; |
| } |
| |
| hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) { |
| for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) { |
| hsa_status_t Err = hsa::amd_agent_iterate_memory_pools( |
| Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) { |
| hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool); |
| if (ValidStatus != HSA_STATUS_SUCCESS) { |
| DP("Alloc allowed in memory pool check failed: %s\n", |
| get_error_string(ValidStatus)); |
| return HSA_STATUS_SUCCESS; |
| } |
| return addDeviceMemoryPool(MemoryPool, DeviceId); |
| }); |
| |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, |
| "Iterate all memory pools", get_error_string(Err)); |
| return Err; |
| } |
| } |
| return HSA_STATUS_SUCCESS; |
| } |
| |
| hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) { |
| std::vector<hsa_amd_memory_pool_t> HostPools; |
| |
| // collect all the "valid" pools for all the given agents. |
| for (const auto &Agent : Agents) { |
| hsa_status_t Err = hsa_amd_agent_iterate_memory_pools( |
| Agent, core::addMemoryPool, static_cast<void *>(&HostPools)); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)); |
| } |
| } |
| |
| // We need two fine-grained pools: |
| // 1. One with the kernarg flag set, for storing kernel arguments |
| // 2. Another for host allocations |
| bool FineGrainedMemoryPoolSet = false; |
| bool KernArgPoolSet = false; |
| for (const auto &MemoryPool : HostPools) { |
| hsa_status_t Err = HSA_STATUS_SUCCESS; |
| uint32_t GlobalFlags = 0; |
| Err = hsa_amd_memory_pool_get_info( |
| MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Get memory pool info failed: %s\n", get_error_string(Err)); |
| return Err; |
| } |
| |
| if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { |
| if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { |
| KernArgPool = MemoryPool; |
| KernArgPoolSet = true; |
| } |
| HostFineGrainedMemoryPool = MemoryPool; |
| FineGrainedMemoryPoolSet = true; |
| } |
| } |
| |
| if (FineGrainedMemoryPoolSet && KernArgPoolSet) |
| return HSA_STATUS_SUCCESS; |
| |
| return HSA_STATUS_ERROR; |
| } |
| |
| hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) { |
| assert(DeviceId < DeviceCoarseGrainedMemoryPools.size() && |
| "Invalid device Id"); |
| return DeviceCoarseGrainedMemoryPools[DeviceId]; |
| } |
| |
| hsa_amd_memory_pool_t getHostMemoryPool() { |
| return HostFineGrainedMemoryPool; |
| } |
| |
| static int readEnv(const char *Env, int Default = -1) { |
| const char *EnvStr = getenv(Env); |
| int Res = Default; |
| if (EnvStr) { |
| Res = std::stoi(EnvStr); |
| DP("Parsed %s=%d\n", Env, Res); |
| } |
| return Res; |
| } |
| |
| RTLDeviceInfoTy() { |
| DP("Start initializing " GETNAME(TARGET_NAME) "\n"); |
| |
| // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr at |
| // runtime; it does not require a debug build of the library. |
| // 0 => no tracing |
| // 1 => tracing dispatch only |
| // >1 => increased verbosity |
| |
| if (!HSAInitSuccess()) { |
| DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"); |
| return; |
| } |
| |
| if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) |
| print_kernel_trace = atoi(EnvStr); |
| else |
| print_kernel_trace = 0; |
| |
| hsa_status_t Err = core::atl_init_gpu_context(); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Error when initializing " GETNAME(TARGET_NAME) "\n"); |
| return; |
| } |
| |
| // Init hostcall soon after initializing hsa |
| hostrpc_init(); |
| |
| Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { |
| if (DeviceType == HSA_DEVICE_TYPE_CPU) { |
| CPUAgents.push_back(Agent); |
| } else { |
| HSAAgents.push_back(Agent); |
| } |
| }); |
| if (Err != HSA_STATUS_SUCCESS) |
| return; |
| |
| NumberOfDevices = (int)HSAAgents.size(); |
| |
| if (NumberOfDevices == 0) { |
| DP("There are no devices supporting HSA.\n"); |
| return; |
| } |
| DP("There are %d devices supporting HSA.\n", NumberOfDevices); |
| |
| // Init the device info |
| HSAQueueSchedulers.reserve(NumberOfDevices); |
| FuncGblEntries.resize(NumberOfDevices); |
| ThreadsPerGroup.resize(NumberOfDevices); |
| ComputeUnits.resize(NumberOfDevices); |
| GPUName.resize(NumberOfDevices); |
| GroupsPerDevice.resize(NumberOfDevices); |
| WarpSize.resize(NumberOfDevices); |
| NumTeams.resize(NumberOfDevices); |
| NumThreads.resize(NumberOfDevices); |
| DeviceStateStore.resize(NumberOfDevices); |
| KernelInfoTable.resize(NumberOfDevices); |
| SymbolInfoTable.resize(NumberOfDevices); |
| DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); |
| DeviceFineGrainedMemoryPools.resize(NumberOfDevices); |
| |
| Err = setupDevicePools(HSAAgents); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Setup for Device Memory Pools failed\n"); |
| return; |
| } |
| |
| Err = setupHostMemoryPools(CPUAgents); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Setup for Host Memory Pools failed\n"); |
| return; |
| } |
| |
| for (int I = 0; I < NumberOfDevices; I++) { |
| uint32_t QueueSize = 0; |
| { |
| hsa_status_t Err = hsa_agent_get_info( |
| HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I); |
| return; |
| } |
| enum { MaxQueueSize = 4096 }; |
| if (QueueSize > MaxQueueSize) { |
| QueueSize = MaxQueueSize; |
| } |
| } |
| |
| { |
| HSAQueueScheduler QSched; |
| if (!QSched.createQueues(HSAAgents[I], QueueSize)) |
| return; |
| HSAQueueSchedulers.emplace_back(std::move(QSched)); |
| } |
| |
| DeviceStateStore[I] = {nullptr, 0}; |
| } |
| |
| for (int I = 0; I < NumberOfDevices; I++) { |
| ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize; |
| GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams; |
| ComputeUnits[I] = 1; |
| DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I, |
| GroupsPerDevice[I], ThreadsPerGroup[I]); |
| } |
| |
| // Get environment variables regarding teams |
| Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); |
| Env.NumTeams = readEnv("OMP_NUM_TEAMS"); |
| Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); |
| Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); |
| Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); |
| |
| // Default state. |
| RequiresFlags = OMP_REQ_UNDEFINED; |
| |
| ConstructionSucceeded = true; |
| } |
| |
| ~RTLDeviceInfoTy() { |
| DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"); |
| if (!HSAInitSuccess()) { |
| // Then none of these can have been set up and they can't be torn down |
| return; |
| } |
| // Run destructors on types that use HSA before |
| // impl_finalize removes access to it |
| DeviceStateStore.clear(); |
| KernelArgPoolMap.clear(); |
| // Terminate hostrpc before finalizing hsa |
| hostrpc_terminate(); |
| |
| hsa_status_t Err; |
| for (uint32_t I = 0; I < HSAExecutables.size(); I++) { |
| Err = hsa_executable_destroy(HSAExecutables[I]); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, |
| "Destroying executable", get_error_string(Err)); |
| } |
| } |
| } |
| }; |
| |
| pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; |
| |
| static RTLDeviceInfoTy DeviceInfo; |
| |
| namespace { |
| |
| int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, |
| __tgt_async_info *AsyncInfo) { |
| assert(AsyncInfo && "AsyncInfo is nullptr"); |
| assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); |
| // Return success if we are not copying back to host from target. |
| if (!HstPtr) |
| return OFFLOAD_SUCCESS; |
| hsa_status_t Err; |
| DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, |
| (long long unsigned)(Elf64_Addr)TgtPtr, |
| (long long unsigned)(Elf64_Addr)HstPtr); |
| |
| Err = DeviceInfo.freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, |
| DeviceId); |
| |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Error when copying data from device to host. Pointers: " |
| "host = 0x%016lx, device = 0x%016lx, size = %lld\n", |
| (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); |
| return OFFLOAD_FAIL; |
| } |
| DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, |
| (long long unsigned)(Elf64_Addr)TgtPtr, |
| (long long unsigned)(Elf64_Addr)HstPtr); |
| return OFFLOAD_SUCCESS; |
| } |
| |
| int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, |
| __tgt_async_info *AsyncInfo) { |
| assert(AsyncInfo && "AsyncInfo is nullptr"); |
| hsa_status_t Err; |
| assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); |
| // Return success if we are not doing host to target. |
| if (!HstPtr) |
| return OFFLOAD_SUCCESS; |
| |
| DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, |
| (long long unsigned)(Elf64_Addr)HstPtr, |
| (long long unsigned)(Elf64_Addr)TgtPtr); |
| Err = DeviceInfo.freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, |
| DeviceId); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Error when copying data from host to device. Pointers: " |
| "host = 0x%016lx, device = 0x%016lx, size = %lld\n", |
| (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); |
| return OFFLOAD_FAIL; |
| } |
| return OFFLOAD_SUCCESS; |
| } |
| |
| // Async. |
| // The implementation was written with CUDA streams in mind. Their semantics |
| // are to execute kernels on a queue in order of insertion. A synchronise |
| // call then makes writes visible between host and device. This means a |
| // series of N data_submit_async calls are expected to execute serially. HSA |
| // offers various options to run the data copies concurrently; exploiting |
| // them may require changes to libomptarget. |
| |
| // __tgt_async_info contains a void *Queue. Queue == 0 is used to indicate |
| // that there are no outstanding kernels that need to be synchronized. Any |
| // async call may be passed Queue == 0, at which point the CUDA |
| // implementation will set it to non-null (see getStream). The CUDA streams |
| // are per-device. Upstream may change this interface to explicitly |
| // initialize the AsyncInfo pointer, but until then HSA lazily initializes |
| // it as well. |
| |
| void initAsyncInfo(__tgt_async_info *AsyncInfo) { |
| // set non-null while using async calls, return to null to indicate completion |
| assert(AsyncInfo); |
| if (!AsyncInfo->Queue) { |
| AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX); |
| } |
| } |
| void finiAsyncInfo(__tgt_async_info *AsyncInfo) { |
| assert(AsyncInfo); |
| assert(AsyncInfo->Queue); |
| AsyncInfo->Queue = 0; |
| } |
| |
| // Determine launch values for kernel. |
| struct LaunchVals { |
| int WorkgroupSize; |
| int GridSize; |
| }; |
| LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, |
| int ConstWGSize, |
| llvm::omp::OMPTgtExecModeFlags ExecutionMode, |
| int NumTeams, int ThreadLimit, uint64_t LoopTripcount, |
| int DeviceNumTeams) { |
| |
| int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize; |
| int NumGroups = 0; |
| |
| int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; |
| if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit)) |
| MaxTeams = RTLDeviceInfoTy::HardTeamLimit; |
| |
| if (print_kernel_trace & STARTUP_DETAILS) { |
| DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams); |
| DP("Max_Teams: %d\n", MaxTeams); |
| DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); |
| DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize); |
| DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", |
| RTLDeviceInfoTy::DefaultWgSize); |
| DP("thread_limit: %d\n", ThreadLimit); |
| DP("threadsPerGroup: %d\n", ThreadsPerGroup); |
| DP("ConstWGSize: %d\n", ConstWGSize); |
| } |
| // check for thread_limit() clause |
| if (ThreadLimit > 0) { |
| ThreadsPerGroup = ThreadLimit; |
| DP("Setting threads per block to requested %d\n", ThreadLimit); |
| // Add master warp for GENERIC |
| if (ExecutionMode == |
| llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { |
| ThreadsPerGroup += WarpSize; |
| DP("Adding master wavefront: +%d threads\n", WarpSize); |
| } |
| if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max |
| ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize; |
| DP("Setting threads per block to maximum %d\n", ThreadsPerGroup); |
| } |
| } |
| // check flat_max_work_group_size attr here |
| if (ThreadsPerGroup > ConstWGSize) { |
| ThreadsPerGroup = ConstWGSize; |
| DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", |
| ThreadsPerGroup); |
| } |
| if (print_kernel_trace & STARTUP_DETAILS) |
| DP("threadsPerGroup: %d\n", ThreadsPerGroup); |
| DP("Preparing %d threads\n", ThreadsPerGroup); |
| |
| // Set default num_groups (teams) |
| if (Env.TeamLimit > 0) |
| NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit; |
| else |
| NumGroups = MaxTeams; |
| DP("Set default num of groups %d\n", NumGroups); |
| |
| if (print_kernel_trace & STARTUP_DETAILS) { |
| DP("num_groups: %d\n", NumGroups); |
| DP("num_teams: %d\n", NumTeams); |
| } |
| |
| // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size. |
| // This reduction is typical for the default case (no thread_limit clause) or |
| // when the user requests an excessive num_teams clause. |
| // FIXME: We can't distinguish between a constant or variable thread limit, |
| // so we only handle constant thread_limits. |
| if (ThreadsPerGroup > |
| RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024 |
| // Should we round threadsPerGroup up to nearest WarpSize |
| // here? |
| NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup; |
| |
| // check for num_teams() clause |
| if (NumTeams > 0) { |
| NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups; |
| } |
| if (print_kernel_trace & STARTUP_DETAILS) { |
| DP("num_groups: %d\n", NumGroups); |
| DP("Env.NumTeams %d\n", Env.NumTeams); |
| DP("Env.TeamLimit %d\n", Env.TeamLimit); |
| } |
| |
| if (Env.NumTeams > 0) { |
| NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups; |
| DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); |
| } else if (Env.TeamLimit > 0) { |
| NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups; |
| DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); |
| } else { |
| if (NumTeams <= 0) { |
| if (LoopTripcount > 0) { |
| if (ExecutionMode == |
| llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { |
| // round up to the nearest integer |
| NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1; |
| } else if (ExecutionMode == |
| llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { |
| NumGroups = LoopTripcount; |
| } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { |
| // This is a generic kernel that was transformed to use SPMD-mode |
| // execution but uses Generic-mode semantics for scheduling. |
| NumGroups = LoopTripcount; |
| } |
| DP("Using %d teams due to loop trip count %" PRIu64 " and number of " |
| "threads per block %d\n", |
| NumGroups, LoopTripcount, ThreadsPerGroup); |
| } |
| } else { |
| NumGroups = NumTeams; |
| } |
| if (NumGroups > MaxTeams) { |
| NumGroups = MaxTeams; |
| if (print_kernel_trace & STARTUP_DETAILS) |
| DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams); |
| } |
| if (NumGroups > NumTeams && NumTeams > 0) { |
| NumGroups = NumTeams; |
| if (print_kernel_trace & STARTUP_DETAILS) |
| DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups, |
| NumTeams); |
| } |
| } |
| |
| // num_teams clause always honored, no matter what, unless DEFAULT is active. |
| if (NumTeams > 0) { |
| NumGroups = NumTeams; |
| // Cap num_groups to EnvMaxTeamsDefault if set. |
| if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) |
| NumGroups = Env.MaxTeamsDefault; |
| } |
| if (print_kernel_trace & STARTUP_DETAILS) { |
| DP("threadsPerGroup: %d\n", ThreadsPerGroup); |
| DP("num_groups: %d\n", NumGroups); |
| DP("loop_tripcount: %ld\n", LoopTripcount); |
| } |
| DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups, |
| ThreadsPerGroup); |
| |
| LaunchVals Res; |
| Res.WorkgroupSize = ThreadsPerGroup; |
| Res.GridSize = ThreadsPerGroup * NumGroups; |
| return Res; |
| } |
| |
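| // Reserve the next packet slot on the queue, spinning until the packet |
| // processor has consumed enough packets for the slot to be free (the queue |
| // is a ring buffer of Queue->size packets). |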
| static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { |
| uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); |
| bool Full = true; |
| while (Full) { |
| Full = |
| PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); |
| } |
| return PacketId; |
| } |
| |
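| // Launch a kernel: compute launch values, reserve a queue slot, fill in the |
| // dispatch packet and its kernarg segment, publish the packet, then block on |
| // the completion signal. Assumes the caller holds the load/run lock (as a |
| // reader), hence the name. |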
| int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, |
| ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, |
| int32_t ThreadLimit, uint64_t LoopTripcount) { |
| // Set the context we are using |
| // update thread limit content in gpu memory if un-initialized or specified |
| // from host |
| |
| DP("Run target team region thread_limit %d\n", ThreadLimit); |
| |
| // All args are references. |
| std::vector<void *> Args(ArgNum); |
| std::vector<void *> Ptrs(ArgNum); |
| |
| DP("Arg_num: %d\n", ArgNum); |
| for (int32_t I = 0; I < ArgNum; ++I) { |
| Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); |
| Args[I] = &Ptrs[I]; |
| DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])); |
| } |
| |
| KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; |
| |
| std::string KernelName = std::string(KernelInfo->Name); |
| auto &KernelInfoTable = DeviceInfo.KernelInfoTable; |
| if (KernelInfoTable[DeviceId].find(KernelName) == |
| KernelInfoTable[DeviceId].end()) { |
| DP("Kernel %s not found\n", KernelName.c_str()); |
| return OFFLOAD_FAIL; |
| } |
| |
| const atl_kernel_info_t KernelInfoEntry = |
| KernelInfoTable[DeviceId][KernelName]; |
| const uint32_t GroupSegmentSize = |
| KernelInfoEntry.group_segment_size + DeviceInfo.Env.DynamicMemSize; |
| const uint32_t SgprCount = KernelInfoEntry.sgpr_count; |
| const uint32_t VgprCount = KernelInfoEntry.vgpr_count; |
| const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; |
| const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; |
| |
| assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count); |
| |
| /* |
| * Set limit based on ThreadsPerGroup and GroupsPerDevice |
| */ |
| LaunchVals LV = |
| getLaunchVals(DeviceInfo.WarpSize[DeviceId], DeviceInfo.Env, |
| KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, |
| NumTeams, // From run_region arg |
| ThreadLimit, // From run_region arg |
| LoopTripcount, // From run_region arg |
| DeviceInfo.NumTeams[KernelInfo->DeviceId]); |
| const int GridSize = LV.GridSize; |
| const int WorkgroupSize = LV.WorkgroupSize; |
| |
| if (print_kernel_trace >= LAUNCH) { |
| int NumGroups = GridSize / WorkgroupSize; |
| // enum modes are SPMD, GENERIC, NONE 0,1,2 |
| // if doing rtl timing, print to stderr, unless stdout requested. |
| bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); |
| fprintf(TraceToStdout ? stdout : stderr, |
| "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " |
| "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " |
| "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", |
| DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, |
| ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, |
| GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, |
| VgprSpillCount, LoopTripcount, KernelInfo->Name); |
| } |
| |
| // Run on the device. |
| { |
| hsa_queue_t *Queue = DeviceInfo.HSAQueueSchedulers[DeviceId].next(); |
| if (!Queue) { |
| return OFFLOAD_FAIL; |
| } |
| uint64_t PacketId = acquireAvailablePacketId(Queue); |
| |
| const uint32_t Mask = Queue->size - 1; // size is a power of 2 |
| hsa_kernel_dispatch_packet_t *Packet = |
| (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); |
| |
| // packet->header is written last |
| Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; |
| Packet->workgroup_size_x = WorkgroupSize; |
| Packet->workgroup_size_y = 1; |
| Packet->workgroup_size_z = 1; |
| Packet->reserved0 = 0; |
| Packet->grid_size_x = GridSize; |
| Packet->grid_size_y = 1; |
| Packet->grid_size_z = 1; |
| Packet->private_segment_size = KernelInfoEntry.private_segment_size; |
| Packet->group_segment_size = GroupSegmentSize; |
| Packet->kernel_object = KernelInfoEntry.kernel_object; |
| Packet->kernarg_address = 0; // use the block allocator |
| Packet->reserved2 = 0; // impl writes id_ here |
| Packet->completion_signal = {0}; // may want a pool of signals |
| |
| KernelArgPool *ArgPool = nullptr; |
| void *KernArg = nullptr; |
| { |
| auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); |
| if (It != KernelArgPoolMap.end()) { |
| ArgPool = (It->second).get(); |
| } |
| } |
| if (!ArgPool) { |
| DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, |
| DeviceId); |
| } |
| { |
| if (ArgPool) { |
| assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))); |
| KernArg = ArgPool->allocate(ArgNum); |
| } |
| if (!KernArg) { |
| DP("Allocate kernarg failed\n"); |
| return OFFLOAD_FAIL; |
| } |
| |
| // Copy explicit arguments |
| for (int I = 0; I < ArgNum; I++) { |
| memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); |
| } |
| |
| // Initialize implicit arguments. TODO: Which of these can be dropped |
| impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>( |
| static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize); |
| memset(ImplArgs, 0, |
| sizeof(impl_implicit_args_t)); // may not be necessary |
| ImplArgs->offset_x = 0; |
| ImplArgs->offset_y = 0; |
| ImplArgs->offset_z = 0; |
| |
| // assign a hostcall buffer for the selected Q |
| if (__atomic_load_n(&DeviceInfo.HostcallRequired, __ATOMIC_ACQUIRE)) { |
| // hostrpc_assign_buffer is not thread safe, and this function is |
| // under a multiple reader lock, not a writer lock. |
| static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; |
| pthread_mutex_lock(&HostcallInitLock); |
| uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo.HSAAgents[DeviceId], |
| Queue, DeviceId); |
| pthread_mutex_unlock(&HostcallInitLock); |
| if (!Buffer) { |
| DP("hostrpc_assign_buffer failed, gpu would dereference null and " |
| "error\n"); |
| return OFFLOAD_FAIL; |
| } |
| |
| DP("Implicit argument count: %d\n", |
| KernelInfoEntry.implicit_argument_count); |
| if (KernelInfoEntry.implicit_argument_count >= 4) { |
| // Initialise pointer for implicit_argument_count != 0 ABI |
| // Guess that the right implicit argument is at offset 24 after |
| // the explicit arguments. In the future, should be able to read |
| // the offset from msgpack. Clang is not annotating it at present. |
| uint64_t Offset = |
| sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); |
| if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { |
| DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit " |
| "args: %d\n", |
| Offset + 8, ArgPool->kernargSizeIncludingImplicit()); |
| } else { |
| memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8); |
| } |
| } |
| |
| // initialise pointer for implicit_argument_count == 0 ABI |
| ImplArgs->hostcall_ptr = Buffer; |
| } |
| |
| Packet->kernarg_address = KernArg; |
| } |
| |
| hsa_signal_t S = DeviceInfo.FreeSignalPool.pop(); |
| if (S.handle == 0) { |
| DP("Failed to get signal instance\n"); |
| return OFFLOAD_FAIL; |
| } |
| Packet->completion_signal = S; |
| hsa_signal_store_relaxed(Packet->completion_signal, 1); |
| |
| // Publish the packet indicating it is ready to be processed |
| core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet), |
| core::createHeader(), Packet->setup); |
| |
| // Since the packet is already published, its contents must not be |
| // accessed any more |
| hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
| |
| while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, |
| HSA_WAIT_STATE_BLOCKED) != 0) |
| ; |
| |
| assert(ArgPool); |
| ArgPool->deallocate(KernArg); |
| DeviceInfo.FreeSignalPool.push(S); |
| } |
| |
| DP("Kernel completed\n"); |
| return OFFLOAD_SUCCESS; |
| } |
| |
| bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { |
| const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h |
| int32_t R = elf_check_machine(Image, AmdgcnMachineID); |
| if (!R) { |
| DP("Supported machine ID not found\n"); |
| } |
| return R; |
| } |
| |
| uint32_t elfEFlags(__tgt_device_image *Image) { |
| char *ImgBegin = (char *)Image->ImageStart; |
| size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; |
| |
| Elf *E = elf_memory(ImgBegin, ImgSize); |
| if (!E) { |
| DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); |
| return 0; |
| } |
| |
| Elf64_Ehdr *Eh64 = elf64_getehdr(E); |
| |
| if (!Eh64) { |
| DP("Unable to get machine ID from ELF file!\n"); |
| elf_end(E); |
| return 0; |
| } |
| |
| uint32_t Flags = Eh64->e_flags; |
| |
| elf_end(E); |
| DP("ELF Flags: 0x%x\n", Flags); |
| return Flags; |
| } |
| |
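| // Clamp *Value to Upper; returns true iff the value was reduced. |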
| template <typename T> bool enforceUpperBound(T *Value, T Upper) { |
| bool Changed = *Value > Upper; |
| if (Changed) { |
| *Value = Upper; |
| } |
| return Changed; |
| } |
| |
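| // Return the unique SHT_HASH section header, or nullptr if the image has |
| // none or more than one. |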
| Elf64_Shdr *findOnlyShtHash(Elf *Elf) { |
| size_t N; |
| int Rc = elf_getshdrnum(Elf, &N); |
| if (Rc != 0) { |
| return nullptr; |
| } |
| |
| Elf64_Shdr *Result = nullptr; |
| for (size_t I = 0; I < N; I++) { |
| Elf_Scn *Scn = elf_getscn(Elf, I); |
| if (Scn) { |
| Elf64_Shdr *Shdr = elf64_getshdr(Scn); |
| if (Shdr) { |
| if (Shdr->sh_type == SHT_HASH) { |
| if (Result == nullptr) { |
| Result = Shdr; |
| } else { |
| // multiple SHT_HASH sections not handled |
| return nullptr; |
| } |
| } |
| } |
| } |
| } |
| return Result; |
| } |
| |
| const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash, |
| const char *Symname) { |
| |
| assert(SectionHash); |
| size_t SectionSymtabIndex = SectionHash->sh_link; |
| Elf64_Shdr *SectionSymtab = |
| elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex)); |
| size_t SectionStrtabIndex = SectionSymtab->sh_link; |
| |
| const Elf64_Sym *Symtab = |
| reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset); |
| |
| const uint32_t *Hashtab = |
| reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset); |
| |
| // Layout: |
| // nbucket |
| // nchain |
| // bucket[nbucket] |
| // chain[nchain] |
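| // A lookup hashes the symbol name, takes Bucket[hash % nbucket] as the |
| // first symbol table index, and follows Chain[index] until it reaches 0 |
| // (STN_UNDEF), which is what the loop below does. |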
| uint32_t Nbucket = Hashtab[0]; |
| const uint32_t *Bucket = &Hashtab[2]; |
| const uint32_t *Chain = &Hashtab[Nbucket + 2]; |
| |
| const size_t Max = strlen(Symname) + 1; |
| const uint32_t Hash = elf_hash(Symname); |
| for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) { |
| char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name); |
| if (strncmp(Symname, N, Max) == 0) { |
| return &Symtab[I]; |
| } |
| } |
| |
| return nullptr; |
| } |
| |
| struct SymbolInfo { |
| void *Addr = nullptr; |
| uint32_t Size = UINT32_MAX; |
| uint32_t ShType = SHT_NULL; |
| }; |
| |
| int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname, |
| SymbolInfo *Res) { |
| if (elf_kind(Elf) != ELF_K_ELF) { |
| return 1; |
| } |
| |
| Elf64_Shdr *SectionHash = findOnlyShtHash(Elf); |
| if (!SectionHash) { |
| return 1; |
| } |
| |
| const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname); |
| if (!Sym) { |
| return 1; |
| } |
| |
| if (Sym->st_size > UINT32_MAX) { |
| return 1; |
| } |
| |
| if (Sym->st_shndx == SHN_UNDEF) { |
| return 1; |
| } |
| |
| Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx); |
| if (!Section) { |
| return 1; |
| } |
| |
| Elf64_Shdr *Header = elf64_getshdr(Section); |
| if (!Header) { |
| return 1; |
| } |
| |
| Res->Addr = Sym->st_value + Base; |
| Res->Size = static_cast<uint32_t>(Sym->st_size); |
| Res->ShType = Header->sh_type; |
| return 0; |
| } |
| |
| int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname, |
| SymbolInfo *Res) { |
| Elf *Elf = elf_memory(Base, ImgSize); |
| if (Elf) { |
| int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res); |
| elf_end(Elf); |
| return Rc; |
| } |
| return 1; |
| } |
| |
| hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, |
| const char *SymName, void **VarAddr, |
| uint32_t *VarSize) { |
| SymbolInfo SI; |
| int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); |
| if (Rc == 0) { |
| *VarAddr = SI.Addr; |
| *VarSize = SI.Size; |
| return HSA_STATUS_SUCCESS; |
| } |
| return HSA_STATUS_ERROR; |
| } |
| |
| template <typename C> |
| hsa_status_t moduleRegisterFromMemoryToPlace( |
| std::map<std::string, atl_kernel_info_t> &KernelInfoTable, |
| std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, |
| void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, |
| std::vector<hsa_executable_t> &HSAExecutables) { |
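| // Trampoline: forward the C++ callable through the C-style callback that |
| // core::RegisterModuleFromMemory expects, recovering the typed callable |
| // from the void *CbState pointer. |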
| auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { |
| C *Unwrapped = static_cast<C *>(CbState); |
| return (*Unwrapped)(Data, Size); |
| }; |
| return core::RegisterModuleFromMemory( |
| KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, |
| DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&Cb), |
| HSAExecutables); |
| } |
| |
| uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) { |
| uint64_t DeviceStateBytes = 0; |
| { |
| // If this is the deviceRTL, get the state variable size |
| SymbolInfo SizeSi; |
| int Rc = getSymbolInfoWithoutLoading( |
| ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi); |
| |
| if (Rc == 0) { |
| if (SizeSi.Size != sizeof(uint64_t)) { |
| DP("Found device_State_size variable with wrong size\n"); |
| return 0; |
| } |
| |
| // Read number of bytes directly from the elf |
| memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t)); |
| } |
| } |
| return DeviceStateBytes; |
| } |
| |
| struct DeviceEnvironment { |
| // Initializes a DeviceEnvironmentTy in the deviceRTL. Patches around |
| // differences in the deviceRTL between trunk, aomp and rocmcc. Over time |
| // these differences will tend to zero and this class can be simplified. |
| // The symbol may be in .data or .bss and may be missing fields. TODO: |
| // review aomp/trunk/rocm and simplify the following. |
| |
| // The symbol may also have been deadstripped because the device side |
| // accessors were unused. |
| |
| // If the symbol is in .data (aomp, rocm) it can be written directly. |
| // If it is in .bss, we must wait for it to be allocated space on the |
| // gpu (trunk) and initialize after loading. |
| const char *sym() { return "omptarget_device_environment"; } |
| |
| DeviceEnvironmentTy HostDeviceEnv; |
| SymbolInfo SI; |
| bool Valid = false; |
| |
| __tgt_device_image *Image; |
| const size_t ImgSize; |
| |
| DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, |
| __tgt_device_image *Image, const size_t ImgSize) |
| : Image(Image), ImgSize(ImgSize) { |
| |
| HostDeviceEnv.NumDevices = NumberDevices; |
| HostDeviceEnv.DeviceNum = DeviceId; |
| HostDeviceEnv.DebugKind = 0; |
| HostDeviceEnv.DynamicMemSize = DynamicMemSize; |
| if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) |
| HostDeviceEnv.DebugKind = std::stoi(EnvStr); |
| |
| int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, |
| sym(), &SI); |
| if (Rc != 0) { |
| DP("Finding global device environment '%s' - symbol missing.\n", sym()); |
| return; |
| } |
| |
| if (SI.Size > sizeof(HostDeviceEnv)) { |
| DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, |
| sizeof(HostDeviceEnv)); |
| return; |
| } |
| |
| Valid = true; |
| } |
| |
| bool inImage() { return SI.ShType != SHT_NOBITS; } |
| |
| hsa_status_t beforeLoading(void *Data, size_t Size) { |
| if (Valid) { |
| if (inImage()) { |
| DP("Setting global device environment before load (%u bytes)\n", |
| SI.Size); |
| uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart; |
| void *Pos = (char *)Data + Offset; |
| memcpy(Pos, &HostDeviceEnv, SI.Size); |
| } |
| } |
| return HSA_STATUS_SUCCESS; |
| } |
| |
| hsa_status_t afterLoading() { |
| if (Valid) { |
| if (!inImage()) { |
| DP("Setting global device environment after load (%u bytes)\n", |
| SI.Size); |
| int DeviceId = HostDeviceEnv.DeviceNum; |
| auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId]; |
| void *StatePtr; |
| uint32_t StatePtrSize; |
| hsa_status_t Err = interop_hsa_get_symbol_info( |
| SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("failed to find %s in loaded image\n", sym()); |
| return Err; |
| } |
| |
| if (StatePtrSize != SI.Size) { |
| DP("Symbol had size %u before loading, %u after\n", StatePtrSize, |
| SI.Size); |
| return HSA_STATUS_ERROR; |
| } |
| |
| return DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, |
| StatePtrSize, DeviceId); |
| } |
| } |
| return HSA_STATUS_SUCCESS; |
| } |
| }; |
| |
| hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { |
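| // hsa_amd_memory_fill writes 32-bit words, so round the allocation up to |
| // a multiple of four bytes and fill Rounded / 4 words. |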
| uint64_t Rounded = 4 * ((Size + 3) / 4); |
| void *Ptr; |
| hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); |
| hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); |
| if (Err != HSA_STATUS_SUCCESS) { |
| return Err; |
| } |
| |
| hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); |
| if (Rc != HSA_STATUS_SUCCESS) { |
| DP("zero fill device_state failed with %u\n", Rc); |
| core::Runtime::Memfree(Ptr); |
| return HSA_STATUS_ERROR; |
| } |
| |
| *RetPtr = Ptr; |
| return HSA_STATUS_SUCCESS; |
| } |
| |
| bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { |
| SymbolInfo SI; |
| int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); |
| return (Rc == 0) && (SI.Addr != nullptr); |
| } |
| |
| } // namespace |
| |
| namespace core { |
| hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { |
| return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(), |
| &DeviceInfo.HSAAgents[0], NULL, Ptr); |
| } |
| } // namespace core |
| |
| static hsa_status_t GetIsaInfo(hsa_isa_t Isa, void *Data) { |
| uint32_t NameLen; |
| hsa_status_t Err = |
| hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME_LENGTH, &NameLen); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Error getting ISA info length\n"); |
| return Err; |
| } |
| |
| char TargetID[NameLen]; |
| Err = hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TargetID); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Error getting ISA info name\n"); |
| return Err; |
| } |
| |
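| // The ISA name is a full triple plus target ID, e.g. (illustrative) |
| // "amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-"; keep only the portion |
| // after the triple prefix. |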
| auto TripleTargetID = llvm::StringRef(TargetID); |
| if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) { |
| DeviceInfo.TargetID.push_back(TripleTargetID.ltrim('-').str()); |
| } |
| return HSA_STATUS_SUCCESS; |
| } |
| |
| /// Parse a TargetID to get processor arch and feature map. |
| /// Returns processor subarch. |
| /// Returns TargetID features in \p FeatureMap argument. |
| /// If the \p TargetID contains feature+, map it to true in \p FeatureMap. |
| /// If the \p TargetID contains feature-, map it to false in \p FeatureMap. |
| /// If the \p TargetID does not contain a feature (default), do not map it. |
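| /// For example (illustrative): parsing "gfx90a:sramecc+:xnack-" returns |
| /// "gfx90a" and maps sramecc to true and xnack to false. |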
| StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) { |
| if (TargetID.empty()) |
| return llvm::StringRef(); |
| |
| auto ArchFeature = TargetID.split(":"); |
| auto Arch = ArchFeature.first; |
| auto Features = ArchFeature.second; |
| if (Features.empty()) |
| return Arch; |
| |
| if (Features.contains("sramecc+")) { |
| FeatureMap.insert(std::pair<std::string, bool>("sramecc", true)); |
| } else if (Features.contains("sramecc-")) { |
| FeatureMap.insert(std::pair<std::string, bool>("sramecc", false)); |
| } |
| if (Features.contains("xnack+")) { |
| FeatureMap.insert(std::pair<std::string, bool>("xnack", true)); |
| } else if (Features.contains("xnack-")) { |
| FeatureMap.insert(std::pair<std::string, bool>("xnack", false)); |
| } |
| |
| return Arch; |
| } |
| |
| /// Checks if an image \p ImgInfo is compatible with the current |
| /// system's environment \p EnvInfo |
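| /// For example (illustrative): an image built for "gfx906" is compatible |
| /// with an environment reporting "gfx906:xnack+" (the image leaves xnack |
| /// as ANY), whereas an image built for "gfx906:xnack+" is incompatible |
| /// with a plain "gfx906" environment (the image specifies more features). |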
| bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) { |
| llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo); |
| |
| // Compatible in case of exact match |
| if (ImgTID == EnvTID) { |
| DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n", |
| ImgTID.data(), EnvTID.data()); |
| return true; |
| } |
| |
| // Incompatible if Archs mismatch. |
| StringMap<bool> ImgMap, EnvMap; |
| StringRef ImgArch = parseTargetID(ImgTID, ImgMap); |
| StringRef EnvArch = parseTargetID(EnvTID, EnvMap); |
| |
| // Neither EnvArch nor ImgArch may be empty here. |
| if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) { |
| DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n", |
| ImgTID.data(), EnvTID.data()); |
| return false; |
| } |
| |
| // Incompatible if image has more features than the environment, irrespective |
| // of type or sign of features. |
| if (ImgMap.size() > EnvMap.size()) { |
| DP("Incompatible: Image has more features than the environment \t[Image: " |
| "%s]\t:\t[Environment: %s]\n", |
| ImgTID.data(), EnvTID.data()); |
| return false; |
| } |
| |
| // Compatible if each target feature specified by the environment is |
| // compatible with the corresponding target feature of the image. The |
| // target feature is compatible if the image does not specify it (meaning |
| // Any), or if it specifies it with the same value (meaning On or Off). |
| for (const auto &ImgFeature : ImgMap) { |
| auto EnvFeature = EnvMap.find(ImgFeature.first()); |
| if (EnvFeature == EnvMap.end()) { |
| DP("Incompatible: Value of Image's non-ANY feature is not matching with " |
| "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: " |
| "%s]\n", |
| ImgTID.data(), EnvTID.data()); |
| return false; |
| } else if (EnvFeature->first() == ImgFeature.first() && |
| EnvFeature->second != ImgFeature.second) { |
| DP("Incompatible: Value of Image's non-ANY feature is not matching with " |
| "the Environment feature's non-ANY value \t[Image: " |
| "%s]\t:\t[Environment: %s]\n", |
| ImgTID.data(), EnvTID.data()); |
| return false; |
| } |
| } |
| |
| // The image is compatible if every Environment feature is either present |
| // in the image's feature map with the same sign, or missing from the |
| // image's feature map (i.e. set to ANY). |
| DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: " |
| "%s]\n", |
| ImgTID.data(), EnvTID.data()); |
| return true; |
| } |
| |
| extern "C" { |
| int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { |
| return elfMachineIdIsAmdgcn(Image); |
| } |
| |
| int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image, |
| __tgt_image_info *info) { |
| if (!__tgt_rtl_is_valid_binary(image)) |
| return false; |
| |
| // A subarchitecture was not specified. Assume it is compatible. |
| if (!info->Arch) |
| return true; |
| |
| int32_t NumberOfDevices = __tgt_rtl_number_of_devices(); |
| |
| for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { |
| __tgt_rtl_init_device(DeviceId); |
| hsa_agent_t agent = DeviceInfo.HSAAgents[DeviceId]; |
| hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId); |
| if (err != HSA_STATUS_SUCCESS) { |
| DP("Error iterating ISAs\n"); |
| return false; |
| } |
| if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo.TargetID[DeviceId])) |
| return false; |
| } |
| DP("Image has Target ID compatible with the current environment: %s\n", |
| info->Arch); |
| return true; |
| } |
| |
| int __tgt_rtl_number_of_devices() { |
| // If the construction failed, no methods are safe to call |
| if (DeviceInfo.ConstructionSucceeded) { |
| return DeviceInfo.NumberOfDevices; |
| } |
| DP("AMDGPU plugin construction failed. Zero devices available\n"); |
| return 0; |
| } |
| |
| int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { |
| DP("Init requires flags to %ld\n", RequiresFlags); |
| DeviceInfo.RequiresFlags = RequiresFlags; |
| return RequiresFlags; |
| } |
| |
| int32_t __tgt_rtl_init_device(int DeviceId) { |
| hsa_status_t Err = hsa_init(); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("HSA Initialization Failed.\n"); |
| return OFFLOAD_FAIL; |
| } |
| // This initialization is done once per device id |
| DP("Initialize the device id: %d\n", DeviceId); |
| |
| hsa_agent_t Agent = DeviceInfo.HSAAgents[DeviceId]; |
| |
| // Get the number of compute units |
| uint32_t ComputeUnits = 0; |
| Err = hsa_agent_get_info( |
| Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, |
| &ComputeUnits); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DeviceInfo.ComputeUnits[DeviceId] = 1; |
| DP("Error getting compute units: setting to 1\n"); |
| } else { |
| DeviceInfo.ComputeUnits[DeviceId] = ComputeUnits; |
| DP("Using %d compute units per grid\n", DeviceInfo.ComputeUnits[DeviceId]); |
| } |
| |
| char GetInfoName[64]; // 64 is the max size returned by get info |
| Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, |
| (void *)GetInfoName); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DeviceInfo.GPUName[DeviceId] = "--unknown gpu--"; |
| } else { |
| DeviceInfo.GPUName[DeviceId] = GetInfoName; |
| } |
| |
| if (print_kernel_trace & STARTUP_DETAILS) |
| DP("Device#%-2d CU's: %2d %s\n", DeviceId, |
| DeviceInfo.ComputeUnits[DeviceId], DeviceInfo.GPUName[DeviceId].c_str()); |
| |
| // Query attributes to determine number of threads/block and blocks/grid. |
| uint16_t WorkgroupMaxDim[3]; |
| Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, |
| &WorkgroupMaxDim); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams; |
| DP("Error getting grid dims: num groups : %d\n", |
| RTLDeviceInfoTy::DefaultNumTeams); |
| } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) { |
| DeviceInfo.GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0]; |
| DP("Using %d ROCm blocks per grid\n", DeviceInfo.GroupsPerDevice[DeviceId]); |
| } else { |
| DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit; |
| DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " |
| "at the hard limit\n", |
| WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit); |
| } |
| |
| // Get thread limit |
| hsa_dim3_t GridMaxDim; |
| Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim); |
| if (Err == HSA_STATUS_SUCCESS) { |
| DeviceInfo.ThreadsPerGroup[DeviceId] = |
| reinterpret_cast<uint32_t *>(&GridMaxDim)[0] / |
| DeviceInfo.GroupsPerDevice[DeviceId]; |
| |
| if (DeviceInfo.ThreadsPerGroup[DeviceId] == 0) { |
| DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; |
| DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); |
| } else if (enforceUpperBound(&DeviceInfo.ThreadsPerGroup[DeviceId], |
| RTLDeviceInfoTy::MaxWgSize)) { |
| DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); |
| } else { |
| DP("Using ROCm Queried thread limit: %d\n", |
| DeviceInfo.ThreadsPerGroup[DeviceId]); |
| } |
| } else { |
| DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; |
| DP("Error getting max block dimension, use default:%d \n", |
| RTLDeviceInfoTy::MaxWgSize); |
| } |
| |
| // Get wavefront size |
| uint32_t WavefrontSize = 0; |
| Err = |
| hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); |
| if (Err == HSA_STATUS_SUCCESS) { |
| DP("Queried wavefront size: %d\n", WavefrontSize); |
| DeviceInfo.WarpSize[DeviceId] = WavefrontSize; |
| } else { |
| // TODO: Burn the wavefront size into the code object |
| DP("Warning: Unknown wavefront size, assuming 64\n"); |
| DeviceInfo.WarpSize[DeviceId] = 64; |
| } |
| |
| // Adjust teams to the env variables |
| |
| if (DeviceInfo.Env.TeamLimit > 0 && |
| (enforceUpperBound(&DeviceInfo.GroupsPerDevice[DeviceId], |
| DeviceInfo.Env.TeamLimit))) { |
| DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", |
| DeviceInfo.Env.TeamLimit); |
| } |
| |
| // Set default number of teams |
| if (DeviceInfo.Env.NumTeams > 0) { |
| DeviceInfo.NumTeams[DeviceId] = DeviceInfo.Env.NumTeams; |
| DP("Default number of teams set according to environment %d\n", |
| DeviceInfo.Env.NumTeams); |
| } else { |
| char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); |
| int TeamsPerCU = DefaultTeamsPerCU; |
| if (TeamsPerCUEnvStr) { |
| TeamsPerCU = std::stoi(TeamsPerCUEnvStr); |
| } |
| |
| DeviceInfo.NumTeams[DeviceId] = |
| TeamsPerCU * DeviceInfo.ComputeUnits[DeviceId]; |
| DP("Default number of teams = %d * number of compute units %d\n", |
| TeamsPerCU, DeviceInfo.ComputeUnits[DeviceId]); |
| } |
| |
| if (enforceUpperBound(&DeviceInfo.NumTeams[DeviceId], |
| DeviceInfo.GroupsPerDevice[DeviceId])) { |
| DP("Default number of teams exceeds device limit, capping at %d\n", |
| DeviceInfo.GroupsPerDevice[DeviceId]); |
| } |
| |
| // Set default number of threads |
| DeviceInfo.NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; |
| DP("Default number of threads set according to library's default %d\n", |
| RTLDeviceInfoTy::DefaultWgSize); |
| |
| // Adjust threads to the env variables. This must happen after the default |
| // is set, otherwise the cap would immediately be overwritten. |
| if (DeviceInfo.Env.TeamThreadLimit > 0 && |
| (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId], |
| DeviceInfo.Env.TeamThreadLimit))) { |
| DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", |
| DeviceInfo.Env.TeamThreadLimit); |
| } |
| |
| if (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId], |
| DeviceInfo.ThreadsPerGroup[DeviceId])) { |
| DP("Default number of threads exceeds device limit, capping at %d\n", |
| DeviceInfo.ThreadsPerGroup[DeviceId]); |
| } |
| |
| DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", |
| DeviceId, DeviceInfo.GroupsPerDevice[DeviceId], |
| DeviceInfo.ThreadsPerGroup[DeviceId]); |
| |
| DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId, |
| DeviceInfo.WarpSize[DeviceId], DeviceInfo.ThreadsPerGroup[DeviceId], |
| DeviceInfo.GroupsPerDevice[DeviceId], |
| DeviceInfo.GroupsPerDevice[DeviceId] * |
| DeviceInfo.ThreadsPerGroup[DeviceId]); |
| |
| return OFFLOAD_SUCCESS; |
| } |
| |
| static __tgt_target_table * |
| __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); |
| |
| __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, |
| __tgt_device_image *Image) { |
| DeviceInfo.LoadRunLock.lock(); |
| __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); |
| DeviceInfo.LoadRunLock.unlock(); |
| return Res; |
| } |
| |
| __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, |
| __tgt_device_image *Image) { |
| // This function loads the device image onto gpu[DeviceId] and does other |
| // per-image initialization work. Specifically: |
| // |
| // - Initialize a DeviceEnvironmentTy instance embedded in the |
| // image at the symbol "omptarget_device_environment" |
| // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. |
| // |
| // - Allocate a large array per-gpu (could be moved to init_device) |
| // - Read a uint64_t at symbol omptarget_nvptx_device_State_size |
| // - Allocate at least that many bytes of gpu memory |
| // - Zero initialize it |
| // - Write the pointer to the symbol omptarget_nvptx_device_State |
| // |
| // - Pulls some per-kernel information together from various sources and |
| // records it in the KernelsList for quicker access later |
| // |
| // The initialization can be done before or after loading the image onto the |
| // gpu. This function presently does a mixture. Using the hsa api to get/set |
| // the information is simpler to implement, in exchange for more complicated |
| // runtime behaviour, e.g. launching a kernel or using dma to get eight bytes |
| // back from the gpu instead of a hashtable lookup on the host. |
| |
| const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; |
| |
| DeviceInfo.clearOffloadEntriesTable(DeviceId); |
| |
| // We do not need to set the ELF version because the caller of this function |
| // had to do that to decide the right runtime to use |
| |
| if (!elfMachineIdIsAmdgcn(Image)) |
| return NULL; |
| |
| { |
| auto Env = DeviceEnvironment(DeviceId, DeviceInfo.NumberOfDevices, |
| DeviceInfo.Env.DynamicMemSize, Image, ImgSize); |
| |
| auto &KernelInfo = DeviceInfo.KernelInfoTable[DeviceId]; |
| auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId]; |
| hsa_status_t Err = moduleRegisterFromMemoryToPlace( |
| KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, |
| [&](void *Data, size_t Size) { |
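| // If the image references the hostcall buffer marker symbol, flag the |
| // device so the kernel-launch path sets up a hostcall buffer. |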
| if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { |
| __atomic_store_n(&DeviceInfo.HostcallRequired, true, |
| __ATOMIC_RELEASE); |
| } |
| return Env.beforeLoading(Data, Size); |
| }, |
| DeviceInfo.HSAExecutables); |
| |
| check("Module registering", Err); |
| if (Err != HSA_STATUS_SUCCESS) { |
| const char *DeviceName = DeviceInfo.GPUName[DeviceId].c_str(); |
| const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); |
| |
| if (strcmp(DeviceName, ElfName) != 0) { |
| DP("Possible gpu arch mismatch: device:%s, image:%s please check" |
| " compiler flag: -march=<gpu>\n", |
| DeviceName, ElfName); |
| } else { |
| DP("Error loading image onto GPU: %s\n", get_error_string(Err)); |
| } |
| |
| return NULL; |
| } |
| |
| Err = Env.afterLoading(); |
| if (Err != HSA_STATUS_SUCCESS) { |
| return NULL; |
| } |
| } |
| |
| DP("AMDGPU module successfully loaded!\n"); |
| |
| { |
| // The device_State array is either a large array in .bss or a void* that |
| // must be pointed at an allocation of device_state_bytes bytes. |
| // If absent, it has been deadstripped and needs no setup. |
| |
| void *StatePtr; |
| uint32_t StatePtrSize; |
| auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId]; |
| hsa_status_t Err = interop_hsa_get_symbol_info( |
| SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr, |
| &StatePtrSize); |
| |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("No device_state symbol found, skipping initialization\n"); |
| } else { |
| if (StatePtrSize < sizeof(void *)) { |
| DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize, |
| sizeof(void *)); |
| return NULL; |
| } |
| |
| // if it's larger than a void*, assume it's a bss array and no further |
| // initialization is required. Only try to set up a pointer for |
| // sizeof(void*) |
| if (StatePtrSize == sizeof(void *)) { |
| uint64_t DeviceStateBytes = |
| getDeviceStateBytes((char *)Image->ImageStart, ImgSize); |
| if (DeviceStateBytes == 0) { |
| DP("Can't initialize device_State, missing size information\n"); |
| return NULL; |
| } |
| |
| auto &DSS = DeviceInfo.DeviceStateStore[DeviceId]; |
| if (DSS.first.get() == nullptr) { |
| assert(DSS.second == 0); |
| void *Ptr = NULL; |
| hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Failed to allocate device_state array\n"); |
| return NULL; |
| } |
| DSS = { |
| std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr}, |
| DeviceStateBytes, |
| }; |
| } |
| |
| void *Ptr = DSS.first.get(); |
| if (DeviceStateBytes != DSS.second) { |
| DP("Inconsistent sizes of device_State unsupported\n"); |
| return NULL; |
| } |
| |
| // write ptr to device memory so it can be used by later kernels |
| Err = DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *), |
| DeviceId); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("memcpy install of state_ptr failed\n"); |
| return NULL; |
| } |
| } |
| } |
| } |
| |
| // Here, we take advantage of the data that is appended after img_end to get |
| // the names of the symbols we need to load. This data consists of the host |
| // entries begin and end as well as the target name (see the offloading |
| // linker script creation in the clang compiler). |
| |
| // Find the symbols in the module by name. The name can be obtained by |
| // concatenating the host entry name with the target name. |
| |
| __tgt_offload_entry *HostBegin = Image->EntriesBegin; |
| __tgt_offload_entry *HostEnd = Image->EntriesEnd; |
| |
| for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { |
| |
| if (!E->addr) { |
| // The host should always have something in the address field to |
| // uniquely identify the target region. |
| DP("Analyzing host entry '<null>' (size = %lld)...\n", |
| (unsigned long long)E->size); |
| return NULL; |
| } |
| |
| if (E->size) { |
| __tgt_offload_entry Entry = *E; |
| |
| void *Varptr; |
| uint32_t Varsize; |
| |
| auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId]; |
| hsa_status_t Err = interop_hsa_get_symbol_info( |
| SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); |
| |
| if (Err != HSA_STATUS_SUCCESS) { |
| // Inform the user what symbol prevented offloading |
| DP("Loading global '%s' (Failed)\n", E->name); |
| return NULL; |
| } |
| |
| if (Varsize != E->size) { |
| DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name, |
| Varsize, E->size); |
| return NULL; |
| } |
| |
| DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", |
| DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)); |
| Entry.addr = (void *)Varptr; |
| |
| DeviceInfo.addOffloadEntry(DeviceId, Entry); |
| |
| if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && |
| E->flags & OMP_DECLARE_TARGET_LINK) { |
| // If unified memory is present any target link variables |
| // can access host addresses directly. There is no longer a |
| // need for device copies. |
| Err = DeviceInfo.freesignalpoolMemcpyH2D(Varptr, E->addr, |
| sizeof(void *), DeviceId); |
| if (Err != HSA_STATUS_SUCCESS) |
| DP("Error when copying USM\n"); |
| DP("Copy linked variable host address (" DPxMOD ")" |
| "to device address (" DPxMOD ")\n", |
| DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)); |
| } |
| |
| continue; |
| } |
| |
| DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)); |
| |
| // Errors in kernarg_segment_size were previously treated as 0 (or undefined) |
| uint32_t KernargSegmentSize = 0; |
| auto &KernelInfoMap = DeviceInfo.KernelInfoTable[DeviceId]; |
| hsa_status_t Err = HSA_STATUS_SUCCESS; |
| if (!E->name) { |
| Err = HSA_STATUS_ERROR; |
| } else { |
| std::string KernelStr = std::string(E->name); |
| auto It = KernelInfoMap.find(KernelStr); |
| if (It != KernelInfoMap.end()) { |
| atl_kernel_info_t Info = It->second; |
| KernargSegmentSize = Info.kernel_segment_size; |
| } else { |
| Err = HSA_STATUS_ERROR; |
| } |
| } |
| |
| // Default value GENERIC (in case the symbol is missing from the image) |
| llvm::omp::OMPTgtExecModeFlags ExecModeVal = |
| llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; |
| |
| // Get the flat group size if present, else use DefaultWgSize |
| int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; |
| |
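| // For a kernel entry named "foo" (illustrative), the image may also carry |
| // the auxiliary globals "foo_kern_desc", "foo_wg_size" and |
| // "foo_exec_mode"; each is optional and the defaults above are used when |
| // one is missing. |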
| // Get the kernel descriptor if present. |
| // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp |
| struct KernDescValType { |
| uint16_t Version; |
| uint16_t TSize; |
| uint16_t WGSize; |
| }; |
| struct KernDescValType KernDescVal; |
| std::string KernDescNameStr(E->name); |
| KernDescNameStr += "_kern_desc"; |
| const char *KernDescName = KernDescNameStr.c_str(); |
| |
| void *KernDescPtr; |
| uint32_t KernDescSize; |
| void *CallStackAddr = nullptr; |
| Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName, |
| &KernDescPtr, &KernDescSize); |
| |
| if (Err == HSA_STATUS_SUCCESS) { |
| if ((size_t)KernDescSize != sizeof(KernDescVal)) |
| DP("Loading global computation properties '%s' - size mismatch (%u != " |
| "%lu)\n", |
| KernDescName, KernDescSize, sizeof(KernDescVal)); |
| |
| memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); |
| |
| // Check structure size against recorded size. |
| if ((size_t)KernDescSize != KernDescVal.TSize) |
| DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", |
| sizeof(KernDescVal), KernDescVal.TSize, KernDescName); |
| |
| DP("After loading global for %s KernDesc \n", KernDescName); |
| DP("KernDesc: Version: %d\n", KernDescVal.Version); |
| DP("KernDesc: TSize: %d\n", KernDescVal.TSize); |
| DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize); |
| |
| if (KernDescVal.WGSize == 0) { |
| KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; |
| DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize); |
| } |
| WGSizeVal = KernDescVal.WGSize; |
| DP("WGSizeVal %d\n", WGSizeVal); |
| check("Loading KernDesc computation property", Err); |
| } else { |
| DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); |
| |
| // Flat group size |
| std::string WGSizeNameStr(E->name); |
| WGSizeNameStr += "_wg_size"; |
| const char *WGSizeName = WGSizeNameStr.c_str(); |
| |
| void *WGSizePtr; |
| uint32_t WGSize; |
| Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, |
| &WGSizePtr, &WGSize); |
| |
| if (Err == HSA_STATUS_SUCCESS) { |
| if ((size_t)WGSize != sizeof(int16_t)) { |
| DP("Loading global computation properties '%s' - size mismatch (%u " |
| "!= " |
| "%lu)\n", |
| WGSizeName, WGSize, sizeof(int16_t)); |
| return NULL; |
| } |
| |
| memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); |
| |
| DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); |
| |
| if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || |
| WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { |
| DP("Error wrong WGSize value specified in HSA code object file: " |
| "%d\n", |
| WGSizeVal); |
| WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; |
| } |
| } else { |
| DP("Warning: Loading WGSize '%s' - symbol not found, " |
| "using default value %d\n", |
| WGSizeName, WGSizeVal); |
| } |
| |
| check("Loading WGSize computation property", Err); |
| } |
| |
| // Read execution mode from global in binary |
| std::string ExecModeNameStr(E->name); |
| ExecModeNameStr += "_exec_mode"; |
| const char *ExecModeName = ExecModeNameStr.c_str(); |
| |
| void *ExecModePtr; |
| uint32_t VarSize; |
| Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, |
| &ExecModePtr, &VarSize); |
| |
| if (Err == HSA_STATUS_SUCCESS) { |
| if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { |
| DP("Loading global computation properties '%s' - size mismatch(%u != " |
| "%lu)\n", |
| ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); |
| return NULL; |
| } |
| |
| memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); |
| |
| DP("After loading global for %s ExecMode = %d\n", ExecModeName, |
| ExecModeVal); |
| |
| if (ExecModeVal < 0 || |
| ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { |
| DP("Error wrong exec_mode value specified in HSA code object file: " |
| "%d\n", |
| ExecModeVal); |
| return NULL; |
| } |
| } else { |
| DP("Loading global exec_mode '%s' - symbol missing, using default " |
| "value " |
| "GENERIC (1)\n", |
| ExecModeName); |
| } |
| check("Loading computation property", Err); |
| |
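| // Record the kernel together with the properties gathered above so a |
| // later launch can look them up via the offload entry table. |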
| KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, |
| CallStackAddr, E->name, KernargSegmentSize, |
| DeviceInfo.KernArgPool)); |
| __tgt_offload_entry Entry = *E; |
| Entry.addr = (void *)&KernelsList.back(); |
| DeviceInfo.addOffloadEntry(DeviceId, Entry); |
| DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); |
| } |
| |
| return DeviceInfo.getOffloadEntriesTable(DeviceId); |
| } |
| |
| void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { |
| void *Ptr = NULL; |
| assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); |
| |
| if (Kind != TARGET_ALLOC_DEFAULT) { |
| REPORT("Invalid target data allocation kind or requested allocator not " |
| "implemented yet\n"); |
| return NULL; |
| } |
| |
| hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); |
| hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); |
| DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size, |
| (long long unsigned)(Elf64_Addr)Ptr); |
| Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL; |
| return Ptr; |
| } |
| |
| int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, |
| int64_t Size) { |
| assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); |
| __tgt_async_info AsyncInfo; |
| int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); |
| if (Rc != OFFLOAD_SUCCESS) |
| return OFFLOAD_FAIL; |
| |
| return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); |
| } |
| |
| int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, |
| int64_t Size, __tgt_async_info *AsyncInfo) { |
| assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); |
| if (AsyncInfo) { |
| initAsyncInfo(AsyncInfo); |
| return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); |
| } |
| return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); |
| } |
| |
| int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, |
| int64_t Size) { |
| assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); |
| __tgt_async_info AsyncInfo; |
| int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); |
| if (Rc != OFFLOAD_SUCCESS) |
| return OFFLOAD_FAIL; |
| |
| return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); |
| } |
| |
| int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, |
| int64_t Size, |
| __tgt_async_info *AsyncInfo) { |
| assert(AsyncInfo && "AsyncInfo is nullptr"); |
| assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); |
| initAsyncInfo(AsyncInfo); |
| return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); |
| } |
| |
| int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) { |
| assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); |
| hsa_status_t Err; |
| DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr); |
| Err = core::Runtime::Memfree(TgtPtr); |
| if (Err != HSA_STATUS_SUCCESS) { |
| DP("Error when freeing CUDA memory\n"); |
| return OFFLOAD_FAIL; |
| } |
| return OFFLOAD_SUCCESS; |
| } |
| |
| int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr, |
| void **TgtArgs, ptrdiff_t *TgtOffsets, |
| int32_t ArgNum, int32_t NumTeams, |
| int32_t ThreadLimit, |
| uint64_t LoopTripcount) { |
| |
| DeviceInfo.LoadRunLock.lock_shared(); |
| int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, |
| ArgNum, NumTeams, ThreadLimit, LoopTripcount); |
| |
| DeviceInfo.LoadRunLock.unlock_shared(); |
| return Res; |
| } |
| |
| int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr, |
| void **TgtArgs, ptrdiff_t *TgtOffsets, |
| int32_t ArgNum) { |
| // Use a single team; leave the thread count at the device default. |
| int32_t TeamNum = 1; |
| int32_t ThreadLimit = 0; // use default |
| return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs, |
| TgtOffsets, ArgNum, TeamNum, |
| ThreadLimit, 0); |
| } |
| |
| int32_t __tgt_rtl_run_target_team_region_async( |
| int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, |
| int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit, |
| uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) { |
| assert(AsyncInfo && "AsyncInfo is nullptr"); |
| initAsyncInfo(AsyncInfo); |
| |
| DeviceInfo.LoadRunLock.lock_shared(); |
| int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, |
| ArgNum, NumTeams, ThreadLimit, LoopTripcount); |
| |
| DeviceInfo.LoadRunLock.unlock_shared(); |
| return Res; |
| } |
| |
| int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr, |
| void **TgtArgs, ptrdiff_t *TgtOffsets, |
| int32_t ArgNum, |
| __tgt_async_info *AsyncInfo) { |
| // Use a single team; leave the thread count at the device default. |
| int32_t TeamNum = 1; |
| int32_t ThreadLimit = 0; // use default |
| return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs, |
| TgtOffsets, ArgNum, TeamNum, |
| ThreadLimit, 0, AsyncInfo); |
| } |
| |
| int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { |
| assert(AsyncInfo && "AsyncInfo is nullptr"); |
| |
| // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this |
| // invariant is not ensured by devices.cpp for amdgcn |
| // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); |
| if (AsyncInfo->Queue) { |
| finiAsyncInfo(AsyncInfo); |
| } |
| return OFFLOAD_SUCCESS; |
| } |
| |
| void __tgt_rtl_print_device_info(int32_t DeviceId) { |
| // TODO: Assertion to see if DeviceId is correct |
| // NOTE: We don't need to set context for print device info. |
| |
| DeviceInfo.printDeviceInfo(DeviceId, DeviceInfo.HSAAgents[DeviceId]); |
| } |
| |
| } // extern "C" |