blob: 189406ac1dc2a1e8bac27f1f6f6f6dd0082ef804 [file] [log] [blame]
//===- PluginInterface.h - Target independent plugin device interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
#include <cstddef>
#include <cstdint>
#include <deque>
#include <list>
#include <map>
#include <shared_mutex>
#include <vector>
#include "Debug.h"
#include "DeviceEnvironment.h"
#include "GlobalHandler.h"
#include "JIT.h"
#include "MemoryManager.h"
#include "Utilities.h"
#include "omptarget.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBufferRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/Triple.h"
namespace llvm {
namespace omp {
namespace target {
namespace plugin {
struct GenericPluginTy;
struct GenericKernelTy;
struct GenericDeviceTy;
/// Class that wraps the __tgt_async_info to simply its usage. In case the
/// object is constructed without a valid __tgt_async_info, the object will use
/// an internal one and will synchronize the current thread with the pending
/// operations when calling AsyncInfoWrapperTy::finalize(). This latter function
/// must be called before destroying the wrapper object.
struct AsyncInfoWrapperTy {
AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr);
~AsyncInfoWrapperTy() {
assert(!AsyncInfoPtr && "AsyncInfoWrapperTy not finalized");
}
/// Get the raw __tgt_async_info pointer.
operator __tgt_async_info *() const { return AsyncInfoPtr; }
/// Get a reference to the underlying plugin-specific queue type.
template <typename Ty> Ty &getQueueAs() const {
static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
"Queue is not of the same size as target type");
return reinterpret_cast<Ty &>(AsyncInfoPtr->Queue);
}
/// Indicate whether there is queue.
bool hasQueue() const { return (AsyncInfoPtr->Queue != nullptr); }
/// Synchronize with the __tgt_async_info's pending operations if it's the
/// internal async info. The error associated to the aysnchronous operations
/// issued in this queue must be provided in \p Err. This function will update
/// the error parameter with the result of the synchronization if it was
/// actually executed. This function must be called before destroying the
/// object and only once.
void finalize(Error &Err);
private:
GenericDeviceTy &Device;
__tgt_async_info LocalAsyncInfo;
__tgt_async_info *AsyncInfoPtr;
};
/// Level of a key-value property inside the printed info tree; it determines
/// the indentation of the property. The first level should be the default.
enum InfoLevelKind {
  InfoLevel1 = 1,
  InfoLevel2 = 2,
  InfoLevel3 = 3,
};
/// Class for storing device information and later be printed. An object of this
/// type acts as a queue of key-value properties. Each property has a key, a
/// a value, and an optional unit for the value. For printing purposes, the
/// information can be classified into several levels. These levels are useful
/// for defining sections and subsections. Thus, each key-value property also
/// has an additional field indicating to which level belongs to. Notice that
/// we use the level to determine the indentation of the key-value property at
/// printing time. See the enum InfoLevelKind for the list of accepted levels.
class InfoQueueTy {
struct InfoQueueEntryTy {
std::string Key;
std::string Value;
std::string Units;
uint64_t Level;
};
std::deque<InfoQueueEntryTy> Queue;
public:
/// Add a new info entry to the queue. The entry requires at least a key
/// string in \p Key. The value in \p Value is optional and can be any type
/// that is representable as a string. The units in \p Units is optional and
/// must be a string. The info level is a template parameter that defaults to
/// the first level (top level).
template <InfoLevelKind L = InfoLevel1, typename T = std::string>
void add(const std::string &Key, T Value = T(),
const std::string &Units = std::string()) {
assert(!Key.empty() && "Invalid info key");
// Convert the value to a string depending on its type.
if constexpr (std::is_same_v<T, bool>)
Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
else if constexpr (std::is_arithmetic_v<T>)
Queue.push_back({Key, std::to_string(Value), Units, L});
else
Queue.push_back({Key, Value, Units, L});
}
/// Print all info entries added to the queue.
void print() const {
// We print four spances for each level.
constexpr uint64_t IndentSize = 4;
// Find the maximum key length (level + key) to compute the individual
// indentation of each entry.
uint64_t MaxKeySize = 0;
for (const auto &Entry : Queue) {
uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
if (KeySize > MaxKeySize)
MaxKeySize = KeySize;
}
// Print all info entries.
for (const auto &Entry : Queue) {
// Compute the indentations for the current entry.
uint64_t KeyIndentSize = Entry.Level * IndentSize;
uint64_t ValIndentSize =
MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;
llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
<< std::string(ValIndentSize, ' ') << Entry.Value
<< (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
}
}
};
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
class DeviceImageTy {
/// Class representing the offload entry table. The class stores the
/// __tgt_target_table and a map to search in the table faster.
struct OffloadEntryTableTy {
/// Add new entry to the table.
void addEntry(const __tgt_offload_entry &Entry) {
Entries.push_back(Entry);
TTTablePtr.EntriesBegin = &Entries[0];
TTTablePtr.EntriesEnd = TTTablePtr.EntriesBegin + Entries.size();
}
/// Get the raw pointer to the __tgt_target_table.
operator __tgt_target_table *() {
if (Entries.empty())
return nullptr;
return &TTTablePtr;
}
private:
__tgt_target_table TTTablePtr;
llvm::SmallVector<__tgt_offload_entry> Entries;
};
/// Image identifier within the corresponding device. Notice that this id is
/// not unique between different device; they may overlap.
int32_t ImageId;
/// The pointer to the raw __tgt_device_image.
const __tgt_device_image *TgtImage;
const __tgt_device_image *TgtImageBitcode;
/// Table of offload entries.
OffloadEntryTableTy OffloadEntryTable;
public:
DeviceImageTy(int32_t Id, const __tgt_device_image *Image)
: ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr) {
assert(TgtImage && "Invalid target image");
}
/// Get the image identifier within the device.
int32_t getId() const { return ImageId; }
/// Get the pointer to the raw __tgt_device_image.
const __tgt_device_image *getTgtImage() const { return TgtImage; }
void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) {
this->TgtImageBitcode = TgtImageBitcode;
}
const __tgt_device_image *getTgtImageBitcode() const {
return TgtImageBitcode;
}
/// Get the image starting address.
void *getStart() const { return TgtImage->ImageStart; }
/// Get the image size.
size_t getSize() const {
return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
}
/// Get a memory buffer reference to the whole image.
MemoryBufferRef getMemoryBuffer() const {
return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
"Image");
}
/// Get a reference to the offload entry table for the image.
OffloadEntryTableTy &getOffloadEntryTable() { return OffloadEntryTable; }
};
/// Base class with the functionality shared by all offload kernels. Every
/// plugin is expected to derive its own kernel class from this generic one and
/// to implement the pure virtual member functions.
struct GenericKernelTy {
  /// Construct a kernel from its name and its execution mode. The thread
  /// counts start at zero.
  GenericKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
      : Name(Name), ExecutionMode(ExecutionMode), PreferredNumThreads(0),
        MaxNumThreads(0) {}

  virtual ~GenericKernelTy() = default;

  /// Initialize the kernel object from a specific device.
  Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image);
  virtual Error initImpl(GenericDeviceTy &GenericDevice,
                         DeviceImageTy &Image) = 0;

  /// Launch the kernel on the specific device. The device must be the same
  /// one used to initialize the kernel.
  Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
               ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
               AsyncInfoWrapperTy &AsyncInfoWrapper) const;
  virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
                           uint64_t NumBlocks, KernelArgsTy &KernelArgs,
                           void *Args,
                           AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;

  /// Get the kernel name.
  const char *getName() const { return Name; }

  /// Tell whether \p ExecutionMode is one of the known execution modes.
  static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
    return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD ||
           ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC ||
           ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD;
  }

protected:
  /// Get a printable name for the execution mode of the kernel.
  const char *getExecutionModeName() const {
    if (isSPMDMode())
      return "SPMD";
    if (isGenericMode())
      return "Generic";
    if (isGenericSPMDMode())
      return "Generic-SPMD";
    llvm_unreachable("Unknown execution mode!");
  }

  /// Print the generic kernel launch information.
  Error printLaunchInfo(GenericDeviceTy &GenericDevice,
                        KernelArgsTy &KernelArgs, uint32_t NumThreads,
                        uint64_t NumBlocks) const;

  /// Print the plugin-specific kernel launch information, which comes after
  /// the generic kernel launch information.
  virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                       KernelArgsTy &KernelArgs,
                                       uint32_t NumThreads,
                                       uint64_t NumBlocks) const;

private:
  /// Prepare the arguments before launching the kernel.
  void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
                    ptrdiff_t *ArgOffsets, int32_t NumArgs,
                    llvm::SmallVectorImpl<void *> &Args,
                    llvm::SmallVectorImpl<void *> &Ptrs) const;

  /// Get the default number of threads and blocks for the kernel.
  virtual uint32_t getDefaultNumThreads(GenericDeviceTy &Device) const = 0;
  virtual uint32_t getDefaultNumBlocks(GenericDeviceTy &Device) const = 0;

  /// Get the number of threads for the kernel, taking the user-defined thread
  /// limit clause into account.
  uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
                         uint32_t ThreadLimitClause[3]) const;

  /// Get the number of blocks for the kernel, taking the user-defined block
  /// limit clause into account. The number of threads \p NumThreads can be
  /// adjusted by this method.
  uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
                        uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
                        uint32_t &NumThreads) const;

  /// Predicates telling whether the kernel runs in Generic-SPMD, Generic or
  /// SPMD mode, respectively.
  bool isGenericSPMDMode() const {
    return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD;
  }
  bool isGenericMode() const {
    return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC;
  }
  bool isSPMDMode() const { return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; }

  /// The kernel name.
  const char *Name;

  /// The execution flags of the kernel.
  OMPTgtExecModeFlags ExecutionMode;

protected:
  /// The preferred number of threads to run the kernel.
  uint32_t PreferredNumThreads;

  /// The maximum number of threads which the kernel could leverage.
  uint32_t MaxNumThreads;
};
/// Class representing a map of host pinned allocations. We track these pinned
/// allocations, so memory transfers involving these buffers can be optimized.
class PinnedAllocationMapTy {
  /// Struct representing a map entry.
  struct EntryTy {
    /// The host pointer of the pinned allocation.
    void *HstPtr;

    /// The pointer that devices' driver should use to transfer data from/to the
    /// pinned allocation. In most plugins, this pointer will be the same as the
    /// host pointer above.
    void *DevAccessiblePtr;

    /// The size of the pinned allocation.
    size_t Size;

    /// Indicate whether the allocation was locked from outside the plugin, for
    /// instance, from the application. The externally locked allocations are
    /// not unlocked by the plugin when unregistering the last user.
    bool ExternallyLocked;

    /// The number of references to the pinned allocation. The allocation should
    /// remain pinned and registered to the map until the number of references
    /// becomes zero. It is mutable because std::set only hands out const
    /// access to its elements and the counter does not take part in the
    /// ordering.
    mutable size_t References;

    /// Create an entry with the host and device accessible pointers, the buffer
    /// size, and a boolean indicating whether the buffer was locked externally.
    /// The entry starts with a single reference.
    EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size,
            bool ExternallyLocked)
        : HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size),
          ExternallyLocked(ExternallyLocked), References(1) {}

    /// Utility constructor used for std::set searches. It is intentionally
    /// implicit so that a search key can be built from a bare host pointer.
    EntryTy(void *HstPtr)
        : HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0),
          ExternallyLocked(false), References(0) {}
  };

  /// Comparator of map entries. Use the host pointer to enforce an order
  /// between entries.
  struct EntryCmpTy {
    bool operator()(const EntryTy &Left, const EntryTy &Right) const {
      return Left.HstPtr < Right.HstPtr;
    }
  };

  using PinnedAllocSetTy = std::set<EntryTy, EntryCmpTy>;

  /// The map of host pinned allocations.
  PinnedAllocSetTy Allocs;

  /// The mutex to protect accesses to the map.
  mutable std::shared_mutex Mutex;

  /// Reference to the corresponding device.
  GenericDeviceTy &Device;

  /// Indicate whether mapped host buffers should be locked automatically.
  bool LockMappedBuffers;

  /// Indicate whether failures when locking mapped buffers should be ignored.
  bool IgnoreLockMappedFailures;

  /// Find the allocation that contains the \p HstPtr address, if any. Return
  /// null when no registered allocation contains it. Assume the map's mutex is
  /// acquired.
  const EntryTy *findIntersecting(const void *HstPtr) const {
    if (Allocs.empty())
      return nullptr;

    // Search the first allocation with starting address that is not less than
    // the buffer address.
    auto It = Allocs.lower_bound({const_cast<void *>(HstPtr)});

    // Direct match of starting addresses.
    if (It != Allocs.end() && It->HstPtr == HstPtr)
      return &(*It);

    // Not a direct match, but the previous pinned allocation in the map (the
    // one starting right below the address) may still contain the buffer.
    // Return null if there is no such previous allocation.
    if (It == Allocs.begin())
      return nullptr;

    // Move to the previous pinned allocation.
    --It;

    // The previous allocation starts below the address, so it contains the
    // address if and only if its end lies beyond it.
    if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr)
      return &(*It);

    // None found.
    return nullptr;
  }

  /// Insert an entry to the map representing a locked buffer. The number of
  /// references is set to one.
  Error insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size,
                    bool ExternallyLocked = false);

  /// Erase an existing entry from the map.
  Error eraseEntry(const EntryTy &Entry);

  /// Register a new user into an entry that represents a locked buffer. Check
  /// also that the registered buffer with \p HstPtr address and \p Size is
  /// actually contained into the entry.
  Error registerEntryUse(const EntryTy &Entry, void *HstPtr, size_t Size);

  /// Unregister a user from the entry and return whether it is the last user.
  /// If it is the last user, the entry will have to be removed from the map
  /// and unlock the entry's host buffer (if necessary).
  Expected<bool> unregisterEntryUse(const EntryTy &Entry);

  /// Indicate whether the first range A fully contains the second range B.
  static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
    void *EndA = advanceVoidPtr(PtrA, SizeA);
    void *EndB = advanceVoidPtr(PtrB, SizeB);
    return (PtrB >= PtrA && EndB <= EndA);
  }

  /// Indicate whether the first range A intersects with the second range B.
  static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
    void *EndA = advanceVoidPtr(PtrA, SizeA);
    void *EndB = advanceVoidPtr(PtrB, SizeB);
    return (PtrA < EndB && PtrB < EndA);
  }

public:
  /// Create the map of pinned allocations corresponding to a specific device.
  /// The locking policy for mapped buffers is read from the
  /// LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS environment variable.
  PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) {
    // Envar that indicates whether mapped host buffers should be locked
    // automatically. The possible values are boolean (on/off) and a special:
    //   off:       Mapped host buffers are not locked.
    //   on:        Mapped host buffers are locked in a best-effort approach.
    //              Failure to lock the buffers are silent.
    //   mandatory: Mapped host buffers are always locked and failures to lock
    //              a buffer results in a fatal error.
    StringEnvar OMPX_LockMappedBuffers("LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS",
                                       "off");

    bool Enabled;
    if (StringParser::parse(OMPX_LockMappedBuffers.get().data(), Enabled)) {
      // Parsed as a boolean value. Enable the feature if necessary.
      LockMappedBuffers = Enabled;
      IgnoreLockMappedFailures = true;
    } else if (OMPX_LockMappedBuffers.get() == "mandatory") {
      // Enable the feature and failures are fatal.
      LockMappedBuffers = true;
      IgnoreLockMappedFailures = false;
    } else {
      // Disable by default.
      DP("Invalid value LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=%s\n",
         OMPX_LockMappedBuffers.get().data());
      LockMappedBuffers = false;
      // Keep both flags initialized on every path; use the same value as the
      // boolean "off" setting so the member never holds an indeterminate
      // value.
      IgnoreLockMappedFailures = true;
    }
  }

  /// Register a buffer that was recently allocated as a locked host buffer.
  /// None of the already registered pinned allocations should intersect with
  /// this new one. The registration requires the host pointer in \p HstPtr,
  /// the device accessible pointer in \p DevAccessiblePtr, and the size of the
  /// allocation in \p Size. The allocation must be unregistered using the
  /// unregisterHostBuffer function.
  Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size);

  /// Unregister a host pinned allocation passing the host pointer which was
  /// previously registered using the registerHostBuffer function. When calling
  /// this function, the pinned allocation cannot have any other user and will
  /// not be unlocked by this function.
  Error unregisterHostBuffer(void *HstPtr);

  /// Lock the host buffer at \p HstPtr or register a new user if it intersects
  /// with an already existing one. A partial overlapping with extension is not
  /// allowed. The function returns the device accessible pointer of the pinned
  /// buffer. The buffer must be unlocked using the unlockHostBuffer function.
  Expected<void *> lockHostBuffer(void *HstPtr, size_t Size);

  /// Unlock the host buffer at \p HstPtr or unregister a user if other users
  /// are still using the pinned allocation. If this was the last user, the
  /// pinned allocation is removed from the map and the memory is unlocked.
  Error unlockHostBuffer(void *HstPtr);

  /// Lock or register a host buffer that was recently mapped by libomptarget.
  /// This behavior is applied if LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS is
  /// enabled. Even if not enabled, externally locked buffers are registered
  /// in order to optimize their transfers.
  Error lockMappedHostBuffer(void *HstPtr, size_t Size);

  /// Unlock or unregister a host buffer that was unmapped by libomptarget.
  Error unlockUnmappedHostBuffer(void *HstPtr);

  /// Return the device accessible pointer associated to the host pinned
  /// allocation which the \p HstPtr belongs, if any. Return null in case the
  /// \p HstPtr does not belong to any host pinned allocation. The device
  /// accessible pointer is the one that devices should use for data transfers
  /// that involve a host pinned buffer. The map is read under a shared lock.
  void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const {
    std::shared_lock<std::shared_mutex> Lock(Mutex);

    // Find the intersecting allocation if any.
    const EntryTy *Entry = findIntersecting(HstPtr);
    if (!Entry)
      return nullptr;

    // Apply the offset of the address within the host allocation to the
    // device accessible base pointer.
    return advanceVoidPtr(Entry->DevAccessiblePtr,
                          getPtrDiff(HstPtr, Entry->HstPtr));
  }

  /// Check whether a buffer belongs to a registered host pinned allocation.
  /// The map is read under a shared lock.
  bool isHostPinnedBuffer(const void *HstPtr) const {
    std::shared_lock<std::shared_mutex> Lock(Mutex);

    // Return whether there is an intersecting allocation.
    return findIntersecting(HstPtr) != nullptr;
  }
};
/// Class implementing common functionalities of offload devices. Each plugin
/// should define the specific device class, derive from this generic one, and
/// implement the necessary virtual function members. Most operations come as a
/// pair: a non-virtual entry point declared here and a pure virtual `*Impl`
/// hook that the plugin-specific device provides.
/// NOTE(review): the entry points are defined outside this header; they
/// presumably forward to the matching `*Impl` hook — confirm in the
/// implementation file.
struct GenericDeviceTy : public DeviceAllocatorTy {
/// Construct a device with its device id within the plugin, the number of
/// devices in the plugin and the grid values for that kind of device.
GenericDeviceTy(int32_t DeviceId, int32_t NumDevices,
const llvm::omp::GV &GridValues);
/// Get the device identifier within the corresponding plugin. Notice that
/// this id is not unique between different plugins; they may overlap.
int32_t getDeviceId() const { return DeviceId; }
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
/// Initialize the device. After this call, the device should be already
/// working and ready to accept queries or modifications.
Error init(GenericPluginTy &Plugin);
/// Plugin-specific part of the device initialization.
virtual Error initImpl(GenericPluginTy &Plugin) = 0;
/// Deinitialize the device and free all its resources. After this call, the
/// device is no longer considered ready, so no queries or modifications are
/// allowed.
Error deinit();
/// Plugin-specific part of the device deinitialization.
virtual Error deinitImpl() = 0;
/// Load the binary image into the device and return the target table.
Expected<__tgt_target_table *> loadBinary(GenericPluginTy &Plugin,
const __tgt_device_image *TgtImage);
/// Plugin-specific loading of the image \p TgtImage, which is identified by
/// \p ImageId within the device. Returns the device image object.
virtual Expected<DeviceImageTy *>
loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
/// Setup the device environment if needed. Notice this setup may not be run
/// on some plugins. By default, it will be executed, but plugins can change
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
/// Register the offload entries for a specific image on the device.
Error registerOffloadEntries(DeviceImageTy &Image);
/// Synchronize the current thread with the pending operations on the
/// __tgt_async_info structure.
Error synchronize(__tgt_async_info *AsyncInfo);
/// Plugin-specific synchronization with the operations of \p AsyncInfo.
virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
/// Query for the completion of the pending operations on the __tgt_async_info
/// structure in a non-blocking manner.
Error queryAsync(__tgt_async_info *AsyncInfo);
/// Plugin-specific non-blocking query of the operations of \p AsyncInfo.
virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
/// Allocate data on the device or involving the device.
Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
/// Deallocate data from the device or involving the device.
Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
/// Pin host memory to optimize transfers and return the device accessible
/// pointer that devices should use for memory transfers involving the host
/// pinned allocation. The request is delegated to the map of pinned
/// allocations.
Expected<void *> dataLock(void *HstPtr, int64_t Size) {
return PinnedAllocs.lockHostBuffer(HstPtr, Size);
}
/// Unpin a host memory buffer that was previously pinned. The request is
/// delegated to the map of pinned allocations.
Error dataUnlock(void *HstPtr) {
return PinnedAllocs.unlockHostBuffer(HstPtr);
}
/// Lock the host buffer \p HstPtr with \p Size bytes with the vendor-specific
/// API and return the device accessible pointer.
virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0;
/// Unlock a previously locked host buffer starting at \p HstPtr.
virtual Error dataUnlockImpl(void *HstPtr) = 0;
/// Mark the host buffer with address \p HstPtr and \p Size bytes as a mapped
/// buffer. This means that libomptarget created a new mapping of that host
/// buffer (e.g., because of a user OpenMP target map) and the buffer may be
/// used as source/destination of memory transfers. We can use this
/// information to lock the host buffer and optimize its memory transfers.
Error notifyDataMapped(void *HstPtr, int64_t Size) {
return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size);
}
/// Mark the host buffer with address \p HstPtr as unmapped. This means that
/// libomptarget removed an existing mapping. If the plugin locked the buffer
/// in notifyDataMapped, this function should unlock it.
Error notifyDataUnmapped(void *HstPtr) {
return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr);
}
/// Check whether the host buffer with address \p HstPtr is pinned by the
/// underlying vendor-specific runtime (if any). Retrieve the host pointer,
/// the device accessible pointer and the size of the original pinned buffer.
virtual Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
void *&BaseDevAccessiblePtr,
size_t &BaseSize) const = 0;
/// Submit data to the device (host to device transfer).
Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
/// Plugin-specific host to device transfer.
virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Retrieve data from the device (device to host transfer).
Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
/// Plugin-specific device to host transfer.
virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Exchange data between devices (device to device transfer). Calling this
/// function is only valid if GenericPlugin::isDataExchangable() passing the
/// two devices returns true.
Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr,
int64_t Size, __tgt_async_info *AsyncInfo);
/// Plugin-specific device to device transfer.
virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Run the kernel associated with \p EntryPtr.
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
/// Initialize a __tgt_async_info structure. Related to interop features.
Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
/// Plugin-specific initialization of the wrapped __tgt_async_info.
virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Initialize a __tgt_device_info structure. Related to interop features.
Error initDeviceInfo(__tgt_device_info *DeviceInfo);
/// Plugin-specific initialization of the __tgt_device_info structure.
virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
/// Create an event.
Error createEvent(void **EventPtrStorage);
/// Plugin-specific event creation; \p EventPtrStorage is the location that
/// receives the event handle (presumably — confirm against the plugins).
virtual Error createEventImpl(void **EventPtrStorage) = 0;
/// Destroy an event.
Error destroyEvent(void *Event);
/// Plugin-specific event destruction.
virtual Error destroyEventImpl(void *EventPtr) = 0;
/// Start the recording of the event.
Error recordEvent(void *Event, __tgt_async_info *AsyncInfo);
/// Plugin-specific event recording.
virtual Error recordEventImpl(void *EventPtr,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Wait for an event to finish. Notice this wait is asynchronous if the
/// __tgt_async_info is not nullptr.
Error waitEvent(void *Event, __tgt_async_info *AsyncInfo);
/// Plugin-specific wait on an event.
virtual Error waitEventImpl(void *EventPtr,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
/// Synchronize the current thread with the event.
Error syncEvent(void *EventPtr);
/// Plugin-specific synchronization of the current thread with the event.
virtual Error syncEventImpl(void *EventPtr) = 0;
/// Print information about the device.
Error printInfo();
/// Plugin-specific hook that fills \p Info with the device properties.
/// NOTE(review): presumably printInfo() prints the queue filled here —
/// confirm in the implementation file.
virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
/// Getters of the grid values.
uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; }
uint32_t getBlockLimit() const { return GridValues.GV_Max_Teams; }
uint32_t getDefaultNumThreads() const {
return GridValues.GV_Default_WG_Size;
}
uint32_t getDefaultNumBlocks() const {
return GridValues.GV_Default_Num_Teams;
}
/// Get the dynamic memory size, taken from the OMPX_SharedMemorySize envar.
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
/// Get target compute unit kind (e.g., sm_80, or gfx908). The generic
/// implementation reports "unknown".
virtual std::string getComputeUnitKind() const { return "unknown"; }
/// Post processing after jit backend. The ownership of \p MB will be taken.
/// The generic implementation hands the buffer back unmodified.
virtual Expected<std::unique_ptr<MemoryBuffer>>
doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const {
return std::move(MB);
}
/// The minimum number of threads we use for a low-trip count combined loop.
/// Instead of using more threads we increase the outer (block/team)
/// parallelism.
/// @see OMPX_MinThreadsForLowTripCount
virtual uint32_t getMinThreadsForLowTripCountLoop() {
return OMPX_MinThreadsForLowTripCount;
}
private:
/// Register offload entry for global variable.
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
const __tgt_offload_entry &GlobalEntry,
__tgt_offload_entry &DeviceEntry);
/// Register offload entry for kernel function.
Error registerKernelOffloadEntry(DeviceImageTy &DeviceImage,
const __tgt_offload_entry &KernelEntry,
__tgt_offload_entry &DeviceEntry);
/// Allocate and construct a kernel object.
virtual Expected<GenericKernelTy *>
constructKernelEntry(const __tgt_offload_entry &KernelEntry,
DeviceImageTy &Image) = 0;
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
/// value to zero for the getters.
virtual Error getDeviceStackSize(uint64_t &V) = 0;
virtual Error setDeviceStackSize(uint64_t V) = 0;
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;
/// Indicate whether the device should setup the device environment. Notice
/// that returning false in this function will change the behavior of the
/// setupDeviceEnvironment() function.
virtual bool shouldSetupDeviceEnvironment() const { return true; }
/// Pointer to the memory manager or nullptr if not available.
MemoryManagerTy *MemoryManager;
/// Environment variables defined by the OpenMP standard.
Int32Envar OMP_TeamLimit;
Int32Envar OMP_NumTeams;
Int32Envar OMP_TeamsThreadLimit;
/// Environment variables defined by the LLVM OpenMP implementation.
Int32Envar OMPX_DebugKind;
UInt32Envar OMPX_SharedMemorySize;
UInt64Envar OMPX_TargetStackSize;
UInt64Envar OMPX_TargetHeapSize;
/// Environment flag to set the minimum number of threads we use for a
/// low-trip count combined loop. Instead of using more threads we increase
/// the outer (block/team) parallelism. Defaults to 32 when the envar is not
/// set.
UInt32Envar OMPX_MinThreadsForLowTripCount =
UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
protected:
/// Return the execution mode used for kernel \p Name.
Expected<OMPTgtExecModeFlags> getExecutionModeForKernel(StringRef Name,
DeviceImageTy &Image);
/// Environment variables defined by the LLVM OpenMP implementation
/// regarding the initial number of streams and events.
UInt32Envar OMPX_InitialNumStreams;
UInt32Envar OMPX_InitialNumEvents;
/// Array of images loaded into the device. Images are automatically
/// deallocated by the allocator.
llvm::SmallVector<DeviceImageTy *> LoadedImages;
/// The identifier of the device within the plugin. Notice this is not a
/// global device id and is not the device id visible to the OpenMP user.
const int32_t DeviceId;
/// The default grid values used for this device.
llvm::omp::GV GridValues;
/// Enumeration used for representing the current state of the peer access
/// between two devices (both under the same plugin). The states can be
/// a) PENDING when the state has not been queried and needs to be queried,
/// b) AVAILABLE when the peer access is available to be used, and
/// c) UNAVAILABLE if the system does not allow it.
enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
/// Array of peer access states with the rest of devices. This means that if
/// the device I has an array PeerAccesses with PeerAccesses[J] == AVAILABLE,
/// the device I can access device J's memory directly. However, notice this
/// does not mean that device J can access device I's memory directly.
llvm::SmallVector<PeerAccessState> PeerAccesses;
/// Mutex associated with the PeerAccesses array (presumably guarding its
/// accesses — confirm against the users of the array).
std::mutex PeerAccessesLock;
/// Map of host pinned allocations used to optimize device transfers.
PinnedAllocationMapTy PinnedAllocs;
};
/// Class implementing common functionalities of offload plugins. Each plugin
/// should define the specific plugin class, derive from this generic one, and
/// implement the necessary virtual function members.
struct GenericPluginTy {
  /// Construct a plugin instance for the target architecture \p TA. The device
  /// count starts at zero so that getNumDevices() and isValidDeviceId() are
  /// well defined (and reject every id) on a freshly constructed plugin; the
  /// count is private and is presumably assigned by init(), which is defined
  /// out of line.
  GenericPluginTy(Triple::ArchType TA)
      : NumDevices(0), RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr),
        JIT(TA) {}

  virtual ~GenericPluginTy() = default;

  /// Initialize the plugin.
  Error init();

  /// Initialize the plugin and return the number of available devices.
  virtual Expected<int32_t> initImpl() = 0;

  /// Deinitialize the plugin and release the resources.
  Error deinit();

  /// Plugin-specific part of the deinitialization.
  virtual Error deinitImpl() = 0;

  /// Get the reference to the device with a certain device id. The id must be
  /// valid and the corresponding device must already be stored in the device
  /// array; both conditions are only checked through assertions.
  GenericDeviceTy &getDevice(int32_t DeviceId) {
    assert(isValidDeviceId(DeviceId) && "Invalid device id");
    assert(Devices[DeviceId] && "Device is unitialized");
    return *Devices[DeviceId];
  }

  /// Get the number of active devices.
  int32_t getNumDevices() const { return NumDevices; }

  /// Get the ELF code to recognize the binary image of this plugin.
  virtual uint16_t getMagicElfBits() const = 0;

  /// Get the target triple of this plugin.
  virtual Triple::ArchType getTripleArch() const = 0;

  /// Allocate storage for a structure of type \p Ty using the internal
  /// allocator. Only suitably sized and aligned memory is obtained here; no
  /// constructor of \p Ty is run by this function.
  template <typename Ty> Ty *allocate() {
    return static_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty)));
  }

  /// Get the reference to the global handler of this plugin. The handler must
  /// have been set before; this is only checked through an assertion.
  GenericGlobalHandlerTy &getGlobalHandler() {
    assert(GlobalHandler && "Global handler not initialized");
    return *GlobalHandler;
  }

  /// Get the reference to the JIT used for all devices connected to this
  /// plugin.
  JITEngine &getJIT() { return JIT; }

  /// Get the OpenMP requires flags set for this plugin.
  int64_t getRequiresFlags() const { return RequiresFlags; }

  /// Set the OpenMP requires flags for this plugin. The previous value is
  /// overwritten by \p Flags.
  void setRequiresFlag(int64_t Flags) { RequiresFlags = Flags; }

  /// Initialize a device within the plugin.
  Error initDevice(int32_t DeviceId);

  /// Deinitialize a device within the plugin and release its resources.
  Error deinitDevice(int32_t DeviceId);

  /// Indicate whether data can be exchanged directly between two devices under
  /// this same plugin. If this function returns true, it's safe to call the
  /// GenericDeviceTy::exchangeData() function on the source device. The
  /// default implementation only requires both device ids to be valid.
  virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) {
    return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId);
  }

  /// Indicate if an image is compatible with the plugin devices. Notice that
  /// this function may be called before actually initializing the devices. So
  /// we could not move this function into GenericDeviceTy.
  virtual Expected<bool> isImageCompatible(__tgt_image_info *Info) const = 0;

  /// Indicate whether the plugin supports empty images. Not supported unless
  /// a plugin overrides this function.
  virtual bool supportsEmptyImages() const { return false; }

protected:
  /// Indicate whether a device id is valid, i.e., whether it lies in the range
  /// [0, getNumDevices()).
  bool isValidDeviceId(int32_t DeviceId) const {
    return (DeviceId >= 0 && DeviceId < getNumDevices());
  }

private:
  /// Number of devices available for the plugin. Zero until it is assigned.
  int32_t NumDevices;

  /// Array of pointers to the devices. Initially, they are all set to nullptr.
  /// Once a device is initialized, the pointer is stored in the position given
  /// by its device id. A position with nullptr means that the corresponding
  /// device was not initialized yet.
  llvm::SmallVector<GenericDeviceTy *> Devices;

  /// OpenMP requires flags.
  int64_t RequiresFlags;

  /// Pointer to the global handler for this plugin.
  GenericGlobalHandlerTy *GlobalHandler;

  /// Internal allocator for different structures.
  BumpPtrAllocator Allocator;

  /// The JIT engine shared by all devices connected to this plugin.
  JITEngine JIT;
};
/// Class for simplifying the getter operation of the plugin. Anywhere on the
/// code, the current plugin can be retrieved by Plugin::get(). The class also
/// declares functions to create plugin-specific object instances. The check(),
/// createPlugin(), createDevice() and createGlobalHandler() functions should be
/// defined by each plugin implementation.
class Plugin {
// Reference to the plugin instance.
static GenericPluginTy *SpecificPlugin;
Plugin() {
if (auto Err = init())
REPORT("Failed to initialize plugin: %s\n",
toString(std::move(Err)).data());
}
~Plugin() {
if (auto Err = deinit())
REPORT("Failed to deinitialize plugin: %s\n",
toString(std::move(Err)).data());
}
Plugin(const Plugin &) = delete;
void operator=(const Plugin &) = delete;
/// Create and intialize the plugin instance.
static Error init() {
assert(!SpecificPlugin && "Plugin already created");
// Create the specific plugin.
SpecificPlugin = createPlugin();
assert(SpecificPlugin && "Plugin was not created");
// Initialize the plugin.
return SpecificPlugin->init();
}
// Deinitialize and destroy the plugin instance.
static Error deinit() {
assert(SpecificPlugin && "Plugin no longer valid");
// Deinitialize the plugin.
if (auto Err = SpecificPlugin->deinit())
return Err;
// Delete the plugin instance.
delete SpecificPlugin;
// Invalidate the plugin reference.
SpecificPlugin = nullptr;
return Plugin::success();
}
public:
/// Initialize the plugin if needed. The plugin could have been initialized by
/// a previous call to Plugin::get().
static Error initIfNeeded() {
// Trigger the initialization if needed.
get();
return Error::success();
}
// Deinitialize the plugin if needed. The plugin could have been deinitialized
// because the plugin library was exiting.
static Error deinitIfNeeded() {
// Do nothing. The plugin is deinitialized automatically.
return Plugin::success();
}
/// Get a reference (or create if it was not created) to the plugin instance.
static GenericPluginTy &get() {
// This static variable will initialize the underlying plugin instance in
// case there was no previous explicit initialization. The initialization is
// thread safe.
static Plugin Plugin;
assert(SpecificPlugin && "Plugin is not active");
return *SpecificPlugin;
}
/// Get a reference to the plugin with a specific plugin-specific type.
template <typename Ty> static Ty &get() { return static_cast<Ty &>(get()); }
/// Indicate whether the plugin is active.
static bool isActive() { return SpecificPlugin != nullptr; }
/// Create a success error. This is the same as calling Error::success(), but
/// it is recommended to use this one for consistency with Plugin::error() and
/// Plugin::check().
static Error success() { return Error::success(); }
/// Create a string error.
template <typename... ArgsTy>
static Error error(const char *ErrFmt, ArgsTy... Args) {
return createStringError(inconvertibleErrorCode(), ErrFmt, Args...);
}
/// Check the plugin-specific error code and return an error or success
/// accordingly. In case of an error, create a string error with the error
/// description. The ErrFmt should follow the format:
/// "Error in <function name>[<optional info>]: %s"
/// The last format specifier "%s" is mandatory and will be used to place the
/// error code's description. Notice this function should be only called from
/// the plugin-specific code.
template <typename... ArgsTy>
static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args);
/// Create a plugin instance.
static GenericPluginTy *createPlugin();
/// Create a plugin-specific device.
static GenericDeviceTy *createDevice(int32_t DeviceId, int32_t NumDevices);
/// Create a plugin-specific global handler.
static GenericGlobalHandlerTy *createGlobalHandler();
};
/// Auxiliary interface class for GenericDeviceResourceManagerTy. This class
/// acts as a reference to a device resource, such as a stream, and requires
/// some basic functions to be implemented. The derived class should define an
/// empty constructor that creates an empty and invalid resource reference. Do
/// not create a new resource on the ctor, but on the create() function instead.
///
/// The resource manager default-constructs references when it grows its pool,
/// copies them when handing them out and taking them back, and invokes
/// create() and destroy() on the pool entries it adds or removes.
struct GenericDeviceResourceRef {
/// Create a new resource on \p Device and store a reference to it. Returns an
/// error if the resource could not be created.
virtual Error create(GenericDeviceTy &Device) = 0;
/// Destroy and release the resources pointed by the reference, using
/// \p Device. Returns an error if the resource could not be released.
virtual Error destroy(GenericDeviceTy &Device) = 0;
protected:
/// Protected and non-virtual: references cannot be destroyed through this
/// interface type from outside the class hierarchy.
~GenericDeviceResourceRef() = default;
};
/// Class that implements a resource pool belonging to a device. This class
/// operates with references to the actual resources. These references must
/// derive from the GenericDeviceResourceRef class and implement the create
/// and destroy virtual functions.
///
/// The pool is split by NextAvailable: the entries in [0, NextAvailable) are
/// slots of resources currently handed out, and the entries in
/// [NextAvailable, size) are created resources ready to be handed out.
/// getResource() and returnResource() take the internal mutex; init() and
/// deinit() do not, so they must not run concurrently with other operations.
template <typename ResourceRef> class GenericDeviceResourceManagerTy {
  using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;

public:
  /// Create an empty resource pool for a specific device.
  GenericDeviceResourceManagerTy(GenericDeviceTy &Device)
      : Device(Device), NextAvailable(0) {}

  /// Destroy the resource pool. At this point, the deinit() function should
  /// already have been executed so the resource pool should be empty.
  virtual ~GenericDeviceResourceManagerTy() {
    assert(ResourcePool.empty() && "Resource pool not empty");
  }

  /// Initialize the resource pool with \p InitialSize created resources. An
  /// initial size of zero is allowed; the pool then grows on first use.
  Error init(uint32_t InitialSize) {
    assert(ResourcePool.empty() && "Resource pool already initialized");
    return ResourcePoolTy::resizeResourcePool(InitialSize);
  }

  /// Deinitialize the resource pool and delete all resources. This function
  /// must be called before the destructor.
  Error deinit() {
    if (NextAvailable)
      DP("Missing %u resources to be returned\n", NextAvailable);

    // TODO: This prevents a bug on libomptarget to make the plugins fail. There
    // may be some resources not returned. Do not destroy these ones.
    if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable))
      return Err;

    // Drop the remaining references (those never returned) without destroying
    // the resources they point to.
    ResourcePool.clear();

    return Plugin::success();
  }

  /// Get resource from the pool or create new resources. If the pool cannot
  /// be grown, the failure is reported and an empty (default-constructed)
  /// reference is returned.
  ResourceRef getResource() {
    const std::lock_guard<std::mutex> Lock(Mutex);

    assert(NextAvailable <= ResourcePool.size() &&
           "Resource pool is corrupted");

    if (NextAvailable == ResourcePool.size()) {
      // By default we double the resource pool every time. Doubling an empty
      // pool would be a no-op and the access below would be out of bounds, so
      // an empty pool grows to a single resource instead.
      const uint32_t NewSize = NextAvailable ? NextAvailable * 2 : 1;
      if (auto Err = ResourcePoolTy::resizeResourcePool(NewSize)) {
        REPORT("Failure to resize the resource pool: %s",
               toString(std::move(Err)).data());
        // Return an empty reference.
        return ResourceRef();
      }
    }
    return ResourcePool[NextAvailable++];
  }

  /// Return resource to the pool. At least one resource must be handed out.
  void returnResource(ResourceRef Resource) {
    const std::lock_guard<std::mutex> Lock(Mutex);

    assert(NextAvailable > 0 && "Resource pool is corrupted");
    ResourcePool[--NextAvailable] = Resource;
  }

private:
  /// Create the resources between \p OldSize and \p NewSize when growing, or
  /// destroy them when shrinking. When growing, this function appends the new
  /// entries itself, one at a time, so that a failure never leaves entries in
  /// the pool whose resource was not created. When shrinking, the entries are
  /// left in place for the caller to remove. Callers on the getResource() path
  /// hold the mutex.
  Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) {
    assert(OldSize != NewSize && "Resizing to the same size");

    if (auto Err = Device.setContext())
      return Err;

    if (OldSize < NewSize) {
      // Create new resources.
      for (uint32_t I = OldSize; I < NewSize; ++I) {
        ResourcePool.emplace_back();
        if (auto Err = ResourcePool.back().create(Device)) {
          // Discard the entry whose creation failed; the entries created so
          // far remain valid and available.
          ResourcePool.pop_back();
          return Err;
        }
      }
    } else {
      // Destroy the obsolete resources.
      for (uint32_t I = NewSize; I < OldSize; ++I) {
        if (auto Err = ResourcePool[I].destroy(Device))
          return Err;
      }
    }
    return Plugin::success();
  }

  /// Increase or decrease the number of resources. This function should
  /// be called with the mutex acquired.
  Error resizeResourcePool(uint32_t NewSize) {
    uint32_t OldSize = ResourcePool.size();

    // Nothing to do.
    if (OldSize == NewSize)
      return Plugin::success();

    // Increase the number of resources. The entries are appended as they are
    // created.
    if (OldSize < NewSize)
      return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);

    // Decrease the number of resources otherwise. The obsolete entries are
    // removed even if destroying some of them failed.
    auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
    ResourcePool.resize(NewSize);

    return Err;
  }

  /// The device to which the resources belong
  GenericDeviceTy &Device;

  /// Mutex for the resource pool.
  std::mutex Mutex;

  /// The next available resource in the pool.
  uint32_t NextAvailable;

  /// The actual resource pool.
  std::deque<ResourceRef> ResourcePool;
};
} // namespace plugin
} // namespace target
} // namespace omp
} // namespace llvm
#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H