| //===-- DeviceMemory.h - Types representing device memory -------*- C++ -*-===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| /// |
| /// \file |
| /// This file defines types that represent device memory buffers. Two memory |
| /// spaces are represented here: global and shared. Host code can have a handle |
| /// to device global memory, and that handle can be used to copy data to and |
| /// from the device. Host code cannot have a handle to device shared memory |
| /// because that memory only exists during the execution of a kernel. |
| /// |
| /// GlobalDeviceMemory<T> is a handle to an array of elements of type T in |
| /// global device memory. It is similar to a pair of a std::unique_ptr<T> and an |
| /// element count to tell how many elements of type T fit in the memory pointed |
| /// to by that T*. |
| /// |
| /// SharedDeviceMemory<T> is just the size in elements of an array of elements |
| /// of type T in device shared memory. No resources are actually attached to |
| /// this class, it is just like a memo to the device to allocate space in shared |
| /// memory. |
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef STREAMEXECUTOR_DEVICEMEMORY_H |
| #define STREAMEXECUTOR_DEVICEMEMORY_H |
| |
| #include <cassert> |
| #include <cstddef> |
| |
| #include "streamexecutor/Error.h" |
| |
| namespace streamexecutor { |
| |
| class Device; |
| |
| template <typename ElemT> class GlobalDeviceMemory; |
| |
| /// Reference to a slice of device memory. |
| /// |
| /// Contains a base memory handle, an element count offset into that base |
| /// memory, and an element count for the size of the slice. |
| template <typename ElemT> class GlobalDeviceMemorySlice { |
| public: |
| using ElementTy = ElemT; |
| |
| /// Intentionally implicit so GlobalDeviceMemory<T> can be passed to functions |
| /// expecting GlobalDeviceMemorySlice<T> arguments. |
| GlobalDeviceMemorySlice(const GlobalDeviceMemory<ElemT> &Memory) |
| : BaseMemory(Memory), ElementOffset(0), |
| ElementCount(Memory.getElementCount()) {} |
| |
| GlobalDeviceMemorySlice(const GlobalDeviceMemory<ElemT> &BaseMemory, |
| size_t ElementOffset, size_t ElementCount) |
| : BaseMemory(BaseMemory), ElementOffset(ElementOffset), |
| ElementCount(ElementCount) { |
| assert(ElementOffset + ElementCount <= BaseMemory.getElementCount() && |
| "slicing past the end of a GlobalDeviceMemory buffer"); |
| } |
| |
| /// Gets the GlobalDeviceMemory backing this slice. |
| const GlobalDeviceMemory<ElemT> &getBaseMemory() const { return BaseMemory; } |
| |
| /// Gets the offset of this slice from the base memory. |
| /// |
| /// The offset is measured in elements, not bytes. |
| size_t getElementOffset() const { return ElementOffset; } |
| |
| /// Gets the number of elements in this slice. |
| size_t getElementCount() const { return ElementCount; } |
| |
| /// Returns the number of bytes that can fit in this slice. |
| size_t getByteCount() const { return ElementCount * sizeof(ElemT); } |
| |
| /// Creates a slice of the memory with the first DropCount elements removed. |
| LLVM_ATTRIBUTE_UNUSED_RESULT |
| GlobalDeviceMemorySlice<ElemT> slice(size_t DropCount) const { |
| assert(DropCount <= ElementCount && |
| "dropping more than the size of a slice"); |
| return GlobalDeviceMemorySlice<ElemT>(BaseMemory, ElementOffset + DropCount, |
| ElementCount - DropCount); |
| } |
| |
| /// Creates a slice of the memory with the last DropCount elements removed. |
| LLVM_ATTRIBUTE_UNUSED_RESULT |
| GlobalDeviceMemorySlice<ElemT> drop_back(size_t DropCount) const { |
| assert(DropCount <= ElementCount && |
| "dropping more than the size of a slice"); |
| return GlobalDeviceMemorySlice<ElemT>(BaseMemory, ElementOffset, |
| ElementCount - DropCount); |
| } |
| |
| /// Creates a slice of the memory that chops off the first DropCount elements |
| /// and keeps the next TakeCount elements. |
| LLVM_ATTRIBUTE_UNUSED_RESULT |
| GlobalDeviceMemorySlice<ElemT> slice(size_t DropCount, |
| size_t TakeCount) const { |
| assert(DropCount + TakeCount <= ElementCount && |
| "sub-slice operation overruns slice bounds"); |
| return GlobalDeviceMemorySlice<ElemT>(BaseMemory, ElementOffset + DropCount, |
| TakeCount); |
| } |
| |
| private: |
| const GlobalDeviceMemory<ElemT> &BaseMemory; |
| size_t ElementOffset; |
| size_t ElementCount; |
| }; |
| |
| /// Wrapper around a generic global device memory allocation. |
| /// |
| /// This class represents a buffer of untyped bytes in the global memory space |
| /// of a device. See GlobalDeviceMemory<T> for the corresponding type that |
| /// includes type information for the elements in its buffer. |
| /// |
| /// This is effectively a pair consisting of an opaque handle and a buffer size |
| /// in bytes. The opaque handle is a platform-dependent handle to the actual |
| /// memory that is allocated on the device. |
| /// |
| /// In some cases, such as in the CUDA platform, the opaque handle may actually |
| /// be a pointer in the virtual address space and it may be valid to perform |
| /// arithmetic on it to obtain other device pointers, but this is not the case |
| /// in general. |
| /// |
| /// For example, in the OpenCL platform, the handle is a pointer to a _cl_mem |
| /// handle object which really is completely opaque to the user. |
| class GlobalDeviceMemoryBase { |
| public: |
| /// Returns an opaque handle to the underlying memory. |
| const void *getHandle() const { return Handle; } |
| |
| /// Returns the address of the opaque handle as stored by this object. |
| const void *const *getHandleAddress() const { return &Handle; } |
| |
| // Cannot copy because the handle must be owned by a single object. |
| GlobalDeviceMemoryBase(const GlobalDeviceMemoryBase &) = delete; |
| GlobalDeviceMemoryBase &operator=(const GlobalDeviceMemoryBase &) = delete; |
| |
| protected: |
| /// Creates a GlobalDeviceMemoryBase from a handle and a byte count. |
| GlobalDeviceMemoryBase(Device *D, const void *Handle, size_t ByteCount) |
| : TheDevice(D), Handle(Handle), ByteCount(ByteCount) {} |
| |
| /// Transfer ownership of the underlying handle. |
| GlobalDeviceMemoryBase(GlobalDeviceMemoryBase &&Other) noexcept |
| : TheDevice(Other.TheDevice), Handle(Other.Handle), |
| ByteCount(Other.ByteCount) { |
| Other.TheDevice = nullptr; |
| Other.Handle = nullptr; |
| Other.ByteCount = 0; |
| } |
| |
| GlobalDeviceMemoryBase &operator=(GlobalDeviceMemoryBase &&Other) noexcept { |
| TheDevice = Other.TheDevice; |
| Handle = Other.Handle; |
| ByteCount = Other.ByteCount; |
| Other.TheDevice = nullptr; |
| Other.Handle = nullptr; |
| Other.ByteCount = 0; |
| return *this; |
| } |
| |
| ~GlobalDeviceMemoryBase(); |
| |
| Device *TheDevice; // Pointer to the device on which this memory lives. |
| const void *Handle; // Platform-dependent value representing allocated memory. |
| size_t ByteCount; // Size in bytes of this allocation. |
| }; |
| |
| /// Typed wrapper around the "void *"-like GlobalDeviceMemoryBase class. |
| /// |
| /// For example, GlobalDeviceMemory<int> is a simple wrapper around |
| /// GlobalDeviceMemoryBase that represents a buffer of integers stored in global |
| /// device memory. |
| template <typename ElemT> |
| class GlobalDeviceMemory : public GlobalDeviceMemoryBase { |
| public: |
| using ElementTy = ElemT; |
| |
| GlobalDeviceMemory(GlobalDeviceMemory &&) noexcept; |
| GlobalDeviceMemory &operator=(GlobalDeviceMemory &&) noexcept; |
| |
| /// Returns the number of elements of type ElemT that constitute this |
| /// allocation. |
| size_t getElementCount() const { return ByteCount / sizeof(ElemT); } |
| |
| /// Returns the number of bytes that can fit in this memory buffer. |
| size_t getByteCount() const { return ByteCount; } |
| |
| /// Converts this memory object into a slice. |
| GlobalDeviceMemorySlice<ElemT> asSlice() const { |
| return GlobalDeviceMemorySlice<ElemT>(*this); |
| } |
| |
| private: |
| GlobalDeviceMemory(const GlobalDeviceMemory &) = delete; |
| GlobalDeviceMemory &operator=(const GlobalDeviceMemory &) = delete; |
| |
| // Only a Device can create a GlobalDeviceMemory instance. |
| friend Device; |
| GlobalDeviceMemory(Device *D, const void *Handle, size_t ElementCount) |
| : GlobalDeviceMemoryBase(D, Handle, ElementCount * sizeof(ElemT)) {} |
| }; |
| |
| template <typename ElemT> |
| GlobalDeviceMemory<ElemT>::GlobalDeviceMemory( |
| GlobalDeviceMemory<ElemT> &&) noexcept = default; |
| |
| template <typename ElemT> |
| GlobalDeviceMemory<ElemT> &GlobalDeviceMemory<ElemT>:: |
| operator=(GlobalDeviceMemory<ElemT> &&) noexcept = default; |
| |
| /// A class to represent the size of a dynamic shared memory buffer of elements |
| /// of type T on a device. |
| /// |
| /// Shared memory buffers exist only on the device and cannot be manipulated |
| /// from the host, so instances of this class do not have an opaque handle, only |
| /// a size. |
| /// |
| /// This type of memory is called "local" memory in OpenCL and "shared" memory |
| /// in CUDA, and both platforms follow the rule that the host code only knows |
| /// the size of these buffers and does not have a handle to them. |
| /// |
| /// The treatment of shared memory in StreamExecutor matches the way it is done |
| /// in OpenCL, where a kernel takes any number of shared memory sizes as kernel |
| /// function arguments. |
| /// |
| /// In CUDA only one shared memory size argument is allowed per kernel call. |
| /// StreamExecutor handles this by allowing CUDA kernel signatures that take |
| /// multiple SharedDeviceMemory arguments, and simply adding together all the |
| /// shared memory sizes to get the final shared memory size that is used to |
| /// launch the kernel. |
| template <typename ElemT> class SharedDeviceMemory { |
| public: |
| /// Creates a typed area of shared device memory with a given number of |
| /// elements. |
| static SharedDeviceMemory<ElemT> makeFromElementCount(size_t ElementCount) { |
| return SharedDeviceMemory(ElementCount); |
| } |
| |
| /// Copyable because it is just an array size. |
| SharedDeviceMemory(const SharedDeviceMemory &) = default; |
| |
| /// Copy-assignable because it is just an array size. |
| SharedDeviceMemory &operator=(const SharedDeviceMemory &) = default; |
| |
| /// Returns the number of elements of type ElemT that can fit in this memory |
| /// buffer. |
| size_t getElementCount() const { return ElementCount; } |
| |
| /// Returns the number of bytes that can fit in this memory buffer. |
| size_t getByteCount() const { return ElementCount * sizeof(ElemT); } |
| |
| /// Returns whether this is a single-element memory buffer. |
| bool isScalar() const { return getElementCount() == 1; } |
| |
| private: |
| /// Constructs a SharedDeviceMemory instance from an element count. |
| /// |
| /// This constructor is not public because there is a potential for confusion |
| /// between the size of the buffer in bytes and the size of the buffer in |
| /// elements. |
| /// |
| /// The static method makeFromElementCount is provided for users of this class |
| /// because its name makes the meaning of the size parameter clear. |
| explicit SharedDeviceMemory(size_t ElementCount) |
| : ElementCount(ElementCount) {} |
| |
| size_t ElementCount; |
| }; |
| |
| } // namespace streamexecutor |
| |
| #endif // STREAMEXECUTOR_DEVICEMEMORY_H |