| //===-- Shared memory RPC client / server utilities -------------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef LLVM_LIBC_SHARED_RPC_UTIL_H |
| #define LLVM_LIBC_SHARED_RPC_UTIL_H |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #if (defined(__NVPTX__) || defined(__AMDGPU__)) && \ |
| !((defined(__CUDA__) && !defined(__CUDA_ARCH__)) || \ |
| (defined(__HIP__) && !defined(__HIP_DEVICE_COMPILE__))) |
| #include <gpuintrin.h> |
| #define RPC_TARGET_IS_GPU |
| #endif |
| |
| // Workaround for missing __has_builtin in < GCC 10. |
| #ifndef __has_builtin |
| #define __has_builtin(x) 0 |
| #endif |
| |
| #ifndef RPC_ATTRS |
| #if defined(__CUDA__) || defined(__HIP__) |
| #define RPC_ATTRS __attribute__((host, device)) inline |
| #else |
| #define RPC_ATTRS inline |
| #endif |
| #endif |
| |
| namespace rpc { |
| |
/// Minimal stand-in for std::type_identity: exposes the template argument
/// unchanged as the member alias `type`.
template <typename Ty> struct type_identity {
  using type = Ty;
};
| |
/// Minimal stand-in for std::integral_constant: a compile-time constant of
/// type `Ty` with value `Val`. (A `static constexpr` data member is
/// implicitly inline in C++17.)
template <class Ty, Ty Val> struct type_constant {
  static constexpr Ty value = Val;
};
| |
| template <class T> struct remove_reference : type_identity<T> {}; |
| template <class T> struct remove_reference<T &> : type_identity<T> {}; |
| template <class T> struct remove_reference<T &&> : type_identity<T> {}; |
| |
| template <class T> struct is_const : type_constant<bool, false> {}; |
| template <class T> struct is_const<const T> : type_constant<bool, true> {}; |
| |
| /// Freestanding implementation of std::move. |
| template <class T> |
| RPC_ATTRS constexpr typename remove_reference<T>::type &&move(T &&t) { |
| return static_cast<typename remove_reference<T>::type &&>(t); |
| } |
| |
| /// Freestanding implementation of std::forward. |
| template <typename T> |
| RPC_ATTRS constexpr T &&forward(typename remove_reference<T>::type &value) { |
| return static_cast<T &&>(value); |
| } |
| template <typename T> |
| RPC_ATTRS constexpr T &&forward(typename remove_reference<T>::type &&value) { |
| return static_cast<T &&>(value); |
| } |
| |
/// Tag type used to request in-place construction (mirrors std::in_place_t).
struct in_place_t {
  // `constexpr` added for consistency with nullopt_t and with the constexpr
  // `in_place` object defined from this type; a defaulted constructor is
  // implicitly constexpr, so this only makes the existing guarantee explicit.
  RPC_ATTRS constexpr explicit in_place_t() = default;
};
| |
/// Tag type representing "no value" for rpc::optional (mirrors
/// std::nullopt_t).
struct nullopt_t {
  RPC_ATTRS constexpr explicit nullopt_t() = default;
};
| |
// Global tag objects, usable as `rpc::in_place` / `rpc::nullopt` like their
// std:: counterparts. `inline` gives a single definition across all TUs that
// include this header.
constexpr inline in_place_t in_place{};
constexpr inline nullopt_t nullopt{};
| |
/// Freestanding and minimal implementation of std::optional.
template <typename T> class optional {
  /// Storage that manually manages the lifetime of the contained value: the
  /// union holds either nothing (`empty`) or a constructed `U`, and `in_use`
  /// records which member is active.
  template <typename U> struct OptionalStorage {
    union {
      char empty;
      U stored_value;
    };

    bool in_use = false;

    // Destroy the stored value, if any, when the storage dies.
    RPC_ATTRS ~OptionalStorage() { reset(); }

    // Default state: the `empty` member is active, no value stored.
    RPC_ATTRS constexpr OptionalStorage() : empty() {}

    // Construct the value in place. NOTE: the caller is responsible for
    // setting `in_use` afterwards (see the optional constructors below).
    template <typename... Args>
    RPC_ATTRS constexpr explicit OptionalStorage(in_place_t, Args &&...args)
        : stored_value(forward<Args>(args)...) {}

    // Destroy the stored value (if present) and return to the empty state.
    RPC_ATTRS constexpr void reset() {
      if (in_use)
        stored_value.~U();
      in_use = false;
    }
  };

  OptionalStorage<T> storage;

public:
  RPC_ATTRS constexpr optional() = default;
  RPC_ATTRS constexpr optional(nullopt_t) {}

  RPC_ATTRS constexpr optional(const T &t) : storage(in_place, t) {
    storage.in_use = true;
  }
  // NOTE(review): the defaulted copy/move operations copy the storage
  // member-wise, which is only well-formed/valid for trivially copyable T —
  // presumably the only instantiations used by the RPC interface; confirm
  // before using with non-trivial types.
  RPC_ATTRS constexpr optional(const optional &) = default;

  RPC_ATTRS constexpr optional(T &&t) : storage(in_place, move(t)) {
    storage.in_use = true;
  }
  RPC_ATTRS constexpr optional(optional &&O) = default;

  // NOTE(review): `storage = move(t)` assigns a T to the storage wrapper, but
  // OptionalStorage has no converting constructor from T and `in_use` is
  // never set here — this overload looks unusable as written; verify it is
  // never instantiated.
  RPC_ATTRS constexpr optional &operator=(T &&t) {
    storage = move(t);
    return *this;
  }
  RPC_ATTRS constexpr optional &operator=(optional &&) = default;

  // NOTE(review): same concern as the T&& overload above.
  RPC_ATTRS constexpr optional &operator=(const T &t) {
    storage = t;
    return *this;
  }
  RPC_ATTRS constexpr optional &operator=(const optional &) = default;

  // Destroy the contained value, if any, leaving the optional empty.
  RPC_ATTRS constexpr void reset() { storage.reset(); }

  // The accessors below do NOT check `in_use`; calling them on an empty
  // optional is undefined behavior, matching std::optional's unchecked
  // operator*/operator-> contract.
  RPC_ATTRS constexpr const T &value() const & { return storage.stored_value; }

  RPC_ATTRS constexpr T &value() & { return storage.stored_value; }

  RPC_ATTRS constexpr explicit operator bool() const { return storage.in_use; }
  RPC_ATTRS constexpr bool has_value() const { return storage.in_use; }
  RPC_ATTRS constexpr const T *operator->() const {
    return &storage.stored_value;
  }
  RPC_ATTRS constexpr T *operator->() { return &storage.stored_value; }
  RPC_ATTRS constexpr const T &operator*() const & {
    return storage.stored_value;
  }
  RPC_ATTRS constexpr T &operator*() & { return storage.stored_value; }

  // Rvalue accessors move the contained value out.
  RPC_ATTRS constexpr T &&value() && { return move(storage.stored_value); }
  RPC_ATTRS constexpr T &&operator*() && { return move(storage.stored_value); }
};
| |
/// Suspend the thread briefly to assist the thread scheduler during busy
/// loops. Falls back to a no-op on targets with no suitable pause primitive.
RPC_ATTRS void sleep_briefly() {
#if __has_builtin(__nvvm_reflect)
  // NVPTX: `nanosleep` requires sm_70+; __nvvm_reflect resolves the
  // architecture check without a separate compilation branch.
  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
    asm("nanosleep.u32 64;" ::: "memory");
#elif __has_builtin(__builtin_amdgcn_s_sleep)
  // AMDGPU scalar sleep instruction.
  __builtin_amdgcn_s_sleep(2);
#elif __has_builtin(__builtin_ia32_pause)
  // x86 PAUSE spin-wait hint.
  __builtin_ia32_pause();
#elif __has_builtin(__builtin_arm_isb)
  // ARM instruction synchronization barrier (0xf == full-system "SY").
  __builtin_arm_isb(0xf);
#else
  // Simply do nothing if sleeping isn't supported on this platform.
#endif
}
| |
/// Compile-time conditional indicating whether this code was compiled for a
/// GPU target (see the RPC_TARGET_IS_GPU definition at the top of the file).
RPC_ATTRS constexpr bool is_process_gpu() {
#ifdef RPC_TARGET_IS_GPU
  return true;
#else
  return false;
#endif
}
| |
/// Wait for all lanes in the group to complete. A no-op on the CPU, where the
/// "group" is a single thread.
RPC_ATTRS void sync_lane([[maybe_unused]] uint64_t lane_mask) {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_sync_lane(lane_mask);
#endif
}
| |
/// Copies the value from the first active thread to the rest. On the CPU the
/// input is returned unchanged, since there is only one "lane".
RPC_ATTRS uint32_t broadcast_value([[maybe_unused]] uint64_t lane_mask,
                                   uint32_t x) {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_read_first_lane_u32(lane_mask, x);
#else
  return x;
#endif
}
| |
/// Returns the number of lanes that participate in the RPC interface; the CPU
/// always operates with a single lane.
RPC_ATTRS uint32_t get_num_lanes() {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_num_lanes();
#else
  return 1;
#endif
}
| |
/// Returns the bitmask of lanes currently executing together. (The original
/// comment here was a copy-paste of get_lane_id's.) On the CPU the single
/// lane is always active, hence the constant mask of 1.
RPC_ATTRS uint64_t get_lane_mask() {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_lane_mask();
#else
  return 1;
#endif
}
| |
/// Returns the id of this thread within its warp / wavefront executing
/// together; always 0 on the CPU.
RPC_ATTRS uint32_t get_lane_id() {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_lane_id();
#else
  return 0;
#endif
}
| |
/// Conditional that is only true for a single thread in a lane (the first
/// active one); always true on the CPU's single thread.
RPC_ATTRS bool is_first_lane([[maybe_unused]] uint64_t lane_mask) {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_is_first_in_lane(lane_mask);
#else
  return true;
#endif
}
| |
/// Returns a bitmask of threads in the current lane for which \p x is true.
/// On the CPU this is just the single thread's own vote (0 or 1).
RPC_ATTRS uint64_t ballot([[maybe_unused]] uint64_t lane_mask, bool x) {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_ballot(lane_mask, x);
#else
  return x;
#endif
}
| |
/// Return \p val rounded up to the nearest multiple of \p align, using
/// truncating integer division.
template <typename V, typename A>
RPC_ATTRS constexpr V align_up(V val, A align) {
  const V alignment = static_cast<V>(align);
  return ((val + alignment - 1) / alignment) * alignment;
}
| |
/// Utility to provide a unified interface between the CPU and GPU's memory
/// model. On the GPU, stack variables are private to each lane, so the
/// pointed-to variable is used directly. On the CPU, one buffer holds a slot
/// for every lane and \p id selects this lane's slot.
template <typename V> RPC_ATTRS V &lane_value(V *val, uint32_t id) {
#ifdef RPC_TARGET_IS_GPU
  (void)id;
  return *val;
#else
  return val[id];
#endif
}
| |
| /// Advance the \p p by \p bytes. |
| template <typename T, typename U> RPC_ATTRS T *advance(T *ptr, U bytes) { |
| if constexpr (is_const<T>::value) |
| return reinterpret_cast<T *>(reinterpret_cast<const uint8_t *>(ptr) + |
| bytes); |
| else |
| return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes); |
| } |
| |
/// Wrapper around the optimal memory copy implementation for the target.
/// Delegates to the compiler builtin so each target gets its best lowering
/// without pulling in a libc dependency.
RPC_ATTRS void rpc_memcpy(void *dst, const void *src, size_t count) {
  __builtin_memcpy(dst, src, count);
}
| |
/// Freestanding implementation of std::max: returns the greater of the two
/// references, preferring \p a when they compare equal.
template <class T> RPC_ATTRS constexpr const T &max(const T &a, const T &b) {
  if (a < b)
    return b;
  return a;
}
| |
| } // namespace rpc |
| |
| #endif // LLVM_LIBC_SHARED_RPC_UTIL_H |