| //===-- GPU memory allocator implementation ---------------------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements a parallel allocator intended for use on a GPU device. |
| // The core algorithm is a slab allocator that uses a random walk over a |
| // bitfield for maximum parallel progress. Slab handling is done by a |
| // wait-free reference-counted guard. The first user of a slab creates it from |
| // system memory for re-use; the last user invalidates it and frees the memory. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "allocator.h" |
| |
| #include "src/__support/CPP/atomic.h" |
| #include "src/__support/CPP/bit.h" |
| #include "src/__support/CPP/new.h" |
| #include "src/__support/GPU/utils.h" |
| #include "src/__support/RPC/rpc_client.h" |
| #include "src/__support/threads/sleep.h" |
| |
| namespace LIBC_NAMESPACE_DECL { |
| |
| constexpr static uint64_t MAX_SIZE = /* 64 GiB */ 64ull * 1024 * 1024 * 1024; |
| constexpr static uint64_t SLAB_SIZE = /* 2 MiB */ 2ull * 1024 * 1024; |
| constexpr static uint64_t ARRAY_SIZE = MAX_SIZE / SLAB_SIZE; |
| constexpr static uint64_t SLAB_ALIGNMENT = SLAB_SIZE - 1; |
| constexpr static uint32_t BITS_IN_WORD = sizeof(uint32_t) * 8; |
| constexpr static uint32_t MIN_SIZE = 16; |
| constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1; |
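| // Note that the *_ALIGNMENT constants are bitmasks of the form (size - 1), |
| // not alignments: `ptr & ~SLAB_ALIGNMENT` rounds a pointer down to its 2MiB |
| // slab and `(x + MIN_ALIGNMENT) & ~MIN_ALIGNMENT` rounds a size up to 16. |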
| |
| // A sentinel used to indicate an invalid but non-null pointer value. |
| constexpr static uint64_t SENTINEL = cpp::numeric_limits<uint64_t>::max(); |
| |
| // The number of times we will try starting on a single index before skipping |
| // past it. |
| constexpr static uint32_t MAX_TRIES = 512; |
| |
| static_assert(!(ARRAY_SIZE & (ARRAY_SIZE - 1)), "Must be a power of two"); |
| |
| namespace impl { |
| // Allocates more memory from the system through the RPC interface. All |
| // allocations from the system MUST be aligned on a 2MiB boundary. The default |
| // HSA allocator has this behavior for any allocation >= 2MiB and the CUDA |
| // driver provides an alignment field for virtual memory allocations. |
| static void *rpc_allocate(uint64_t size) { |
| void *ptr = nullptr; |
| rpc::Client::Port port = rpc::client.open<LIBC_MALLOC>(); |
| port.send_and_recv( |
| [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; }, |
| [&](rpc::Buffer *buffer, uint32_t) { |
| ptr = reinterpret_cast<void *>(buffer->data[0]); |
| }); |
| port.close(); |
| return ptr; |
| } |
| |
| // Deallocates the associated system memory. |
| static void rpc_free(void *ptr) { |
| rpc::Client::Port port = rpc::client.open<LIBC_FREE>(); |
| port.send([=](rpc::Buffer *buffer, uint32_t) { |
| buffer->data[0] = reinterpret_cast<uintptr_t>(ptr); |
| }); |
| port.close(); |
| } |
| |
| // Convert a potentially disjoint bitmask into an increasing per-lane integer |
| // for use when indexing between GPU lanes. |
| static inline uint32_t lane_count(uint64_t lane_mask) { |
| return cpp::popcount(lane_mask & ((uint64_t(1) << gpu::get_lane_id()) - 1)); |
| } |
| |
| // Obtain an initial value to seed a random number generator. We use rounded |
| // multiples of the golden ratio, as in xorshift*, for additional spreading. |
| static inline uint32_t entropy() { |
| return (static_cast<uint32_t>(gpu::processor_clock()) ^ |
| (gpu::get_thread_id_x() * 0x632be59b) ^ |
| (gpu::get_block_id_x() * 0x85157af5)) * |
| 0x9e3779bb; |
| } |
| |
| // Generate a random number and update the state using the xorshift32* PRNG. |
| static inline uint32_t xorshift32(uint32_t &state) { |
| state ^= state << 13; |
| state ^= state >> 17; |
| state ^= state << 5; |
| return state * 0x9e3779bb; |
| } |
| |
| // The final mixing stage of MurmurHash3, used to obtain a well-distributed |
| // starting index into the global array. |
| static inline uint32_t hash(uint32_t x) { |
| x ^= x >> 16; |
| x *= 0x85ebca6b; |
| x ^= x >> 13; |
| x *= 0xc2b2ae35; |
| x ^= x >> 16; |
| return x; |
| } |
| |
| // Rounds the input value up to the nearest permitted chunk size. Permitted |
| // sizes are sums of up to three consecutive powers of two, i.e. 1x, 1.5x, |
| // 1.75x, or 2x a base power of two. For a 2MiB slab size this is 48 different |
| // chunk sizes, giving an average utilization of roughly 87.5% (about 12.5% |
| // internal fragmentation on average). |
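| // For illustration, a request of 100 bytes has pow2 = 7 and candidate sizes |
| // {64, 96, 112, 128}, so it is served from 112-byte chunks. For very small |
| // requests the intermediate sizes collapse onto 16-byte multiples, e.g. a |
| // 17-byte request is served from 32-byte chunks. |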
| static inline uint32_t get_chunk_size(uint32_t x) { |
| uint32_t y = x < MIN_SIZE ? MIN_SIZE : x; |
| uint32_t pow2 = BITS_IN_WORD - cpp::countl_zero(y - 1); |
| |
| uint32_t s0 = 0b0100 << (pow2 - 3); |
| uint32_t s1 = 0b0110 << (pow2 - 3); |
| uint32_t s2 = 0b0111 << (pow2 - 3); |
| uint32_t s3 = 0b1000 << (pow2 - 3); |
| |
| if (s0 > y) |
| return (s0 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT; |
| if (s1 > y) |
| return (s1 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT; |
| if (s2 > y) |
| return (s2 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT; |
| return (s3 + MIN_ALIGNMENT) & ~MIN_ALIGNMENT; |
| } |
| |
| // Rounds x up to the next multiple of N, where N must be a power of two. |
| template <uint32_t N, typename T> |
| static inline constexpr T round_up(const T x) { |
| static_assert(((N - 1) & N) == 0, "N must be a power of two"); |
| return (x + N) & ~(N - 1); |
| } |
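| |
| // A minimal compile-time illustration of the helper above; note that a value |
| // that is already a multiple of N is bumped to the next multiple. |
| static_assert(round_up<16>(uint32_t(5)) == 16, "rounds 5 up to 16"); |
| static_assert(round_up<16>(uint32_t(16)) == 32, "16 is bumped to 32"); |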
| |
| // Perform a lane-parallel memset over an array of uint32_t words. |
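| // For example, with four participating lanes and n = 10, the lanes write the |
| // words {0, 4, 8}, {1, 5, 9}, {2, 6}, and {3, 7} respectively, clearing the |
| // region cooperatively without overlap. |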
| void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) { |
| uint64_t mask = gpu::get_lane_mask(); |
| uint32_t workers = cpp::popcount(uniform); |
| for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers) |
| s[i] = c; |
| } |
| |
| } // namespace impl |
| |
| /// A slab allocator used to hand out identically sized slabs of memory. |
| /// Allocation is done through random walks of a bitfield until a free bit is |
| /// encountered. This reduces contention and is highly parallel on a GPU. |
| /// |
| /// 0 4 8 16 ... 2 MiB |
| /// ┌────────┬──────────┬────────┬──────────────────┬──────────────────────────┐ |
| /// │ chunk │ index │ pad │ bitfield[] │ memory[] │ |
| /// └────────┴──────────┴────────┴──────────────────┴──────────────────────────┘ |
| /// |
| /// The number of words in the bitfield is the slab size divided by the chunk |
| /// size divided by the number of bits per word. The header is padded to |
| /// ensure 16 byte alignment of the memory region; because of this offset, |
| /// pointers handed out from a slab are never 2MiB aligned, which is how they |
| /// are distinguished from allocations made directly by the global allocator. |
| struct Slab { |
| // Header metadata for the slab, aligned to the minimum alignment. |
| struct alignas(MIN_SIZE) Header { |
| uint32_t chunk_size; |
| uint32_t global_index; |
| }; |
| |
| // Initialize the slab with its chunk size and index in the global table for |
| // use when freeing. |
| Slab(uint32_t chunk_size, uint32_t global_index) { |
| Header *header = reinterpret_cast<Header *>(memory); |
| header->chunk_size = chunk_size; |
| header->global_index = global_index; |
| } |
| |
| // Set the necessary bitfield bytes to zero in parallel using many lanes. This |
| // must be called before the bitfield can be accessed safely; memory is not |
| // guaranteed to be zero initialized in the current implementation. |
| void initialize(uint64_t uniform) { |
| uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) / |
| sizeof(uint32_t); |
| impl::uniform_memset(get_bitfield(), 0, size, uniform); |
| } |
| |
| // Get the number of chunks that can theoretically fit inside this slab. |
| constexpr static uint32_t num_chunks(uint32_t chunk_size) { |
| return SLAB_SIZE / chunk_size; |
| } |
| |
| // Get the number of bytes needed to contain the bitfield bits. |
| constexpr static uint32_t bitfield_bytes(uint32_t chunk_size) { |
| return ((num_chunks(chunk_size) + BITS_IN_WORD - 1) / BITS_IN_WORD) * 8; |
| } |
| |
| // The actual amount of memory available excluding the bitfield and metadata. |
| constexpr static uint32_t available_bytes(uint32_t chunk_size) { |
| return SLAB_SIZE - bitfield_bytes(chunk_size) - sizeof(Header); |
| } |
| |
| // The number of chunks that can be stored in this slab. |
| constexpr static uint32_t available_chunks(uint32_t chunk_size) { |
| return available_bytes(chunk_size) / chunk_size; |
| } |
| |
| // The number of bits in the bitfield that map to usable chunks. |
| constexpr static uint32_t usable_bits(uint32_t chunk_size) { |
| return available_bytes(chunk_size) / chunk_size; |
| } |
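| |
| // As a worked example, for the minimum chunk size of 16 bytes: num_chunks is |
| // 131072, bitfield_bytes is 32768, available_bytes is 2097152 - 32768 - 16 = |
| // 2064368, and available_chunks is therefore 129023. |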
| |
| // Get the chunk size stored in the slab's header. |
| uint32_t get_chunk_size() const { |
| return reinterpret_cast<const Header *>(memory)->chunk_size; |
| } |
| |
| // Get the global index stored in the slab's header. |
| uint32_t get_global_index() const { |
| return reinterpret_cast<const Header *>(memory)->global_index; |
| } |
| |
| // Get a pointer to where the bitfield is located in the memory. |
| uint32_t *get_bitfield() { |
| return reinterpret_cast<uint32_t *>(memory + sizeof(Header)); |
| } |
| |
| // Get a pointer to where the actual memory to be allocated lives. |
| uint8_t *get_memory(uint32_t chunk_size) { |
| return reinterpret_cast<uint8_t *>(get_bitfield()) + |
| bitfield_bytes(chunk_size); |
| } |
| |
| // Get a pointer to the actual memory given an index into the bitfield. |
| void *ptr_from_index(uint32_t index, uint32_t chunk_size) { |
| return get_memory(chunk_size) + index * chunk_size; |
| } |
| |
| // Convert a pointer back into its bitfield index using its offset. |
| uint32_t index_from_ptr(void *ptr, uint32_t chunk_size) { |
| return static_cast<uint32_t>(reinterpret_cast<uint8_t *>(ptr) - |
| get_memory(chunk_size)) / |
| chunk_size; |
| } |
| |
| // Randomly walks the bitfield until it finds a free bit. Allocations attempt |
| // to put lanes right next to each other for better caching and convergence. |
| void *allocate(uint64_t lane_mask, uint64_t uniform) { |
| uint32_t chunk_size = get_chunk_size(); |
| uint32_t state = impl::entropy(); |
| |
| // The uniform mask represents which lanes contain a uniform target pointer. |
| // We attempt to place these next to each other. |
| void *result = nullptr; |
| for (uint64_t mask = lane_mask; mask; |
| mask = gpu::ballot(lane_mask, !result)) { |
| if (result) |
| continue; |
| |
| uint32_t start = gpu::broadcast_value(lane_mask, impl::xorshift32(state)); |
| |
| uint32_t id = impl::lane_count(uniform & mask); |
| uint32_t index = (start + id) % usable_bits(chunk_size); |
| uint32_t slot = index / BITS_IN_WORD; |
| uint32_t bit = index % BITS_IN_WORD; |
| |
| // Get the mask of bits destined for the same slot and coalesce it. |
| uint64_t match = uniform & gpu::match_any(mask, slot); |
| uint32_t length = cpp::popcount(match); |
| uint32_t bitmask = static_cast<uint32_t>((uint64_t(1) << length) - 1) |
| << bit; |
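| // For example, if three lanes of the same uniform group land in this slot at |
| // consecutive bits 12, 13, and 14, the leader's bitmask is 0b111 << 12 and a |
| // single fetch_or below claims all three bits; each lane then tests only its |
| // own bit in the shuffled result. |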
| |
| uint32_t before = 0; |
| if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(match))) |
| before = cpp::AtomicRef(get_bitfield()[slot]) |
| .fetch_or(bitmask, cpp::MemoryOrder::RELAXED); |
| before = gpu::shuffle(mask, cpp::countr_zero(match), before); |
| if (~before & (1u << bit)) |
| result = ptr_from_index(index, chunk_size); |
| else |
| sleep_briefly(); |
| } |
| |
| cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); |
| return result; |
| } |
| |
| // Deallocates memory by resetting its corresponding bit in the bitfield. |
| void deallocate(void *ptr) { |
| uint32_t chunk_size = get_chunk_size(); |
| uint32_t index = index_from_ptr(ptr, chunk_size); |
| uint32_t slot = index / BITS_IN_WORD; |
| uint32_t bit = index % BITS_IN_WORD; |
| |
| cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); |
| cpp::AtomicRef(get_bitfield()[slot]) |
| .fetch_and(~(1u << bit), cpp::MemoryOrder::RELAXED); |
| } |
| |
| // The actual memory the slab will manage. All offsets are calculated at |
| // runtime with the chunk size to keep the interface convergent when a warp or |
| // wavefront is handling multiple sizes at once. |
| uint8_t memory[SLAB_SIZE]; |
| }; |
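| |
| // A small compile-time sanity check of the layout described above: the Slab |
| // wrapper adds no overhead of its own, so one Slab occupies exactly one 2MiB |
| // system allocation. |
| static_assert(sizeof(Slab) == SLAB_SIZE, "A slab must fill its 2MiB block"); |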
| |
| /// A wait-free guard around a pointer resource that is created dynamically |
| /// when space is available and freed once there are no more users. |
| struct GuardPtr { |
| private: |
| struct RefCounter { |
| // Indicates that the object is in its deallocation phase and thus invalid. |
| static constexpr uint64_t INVALID = uint64_t(1) << 63; |
| |
| // If a read preempts an unlock call we indicate this so the following |
| // unlock call can swap out the helped bit and maintain exclusive ownership. |
| static constexpr uint64_t HELPED = uint64_t(1) << 62; |
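| |
| // For example: a releasing thread drops the count to zero while a concurrent |
| // read() observes the zero and installs INVALID | HELPED. The releaser's CAS |
| // from zero then fails, but seeing HELPED it exchanges the value for INVALID |
| // and still obtains exclusive ownership of the teardown. |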
| |
| // Resets the reference counter; it cannot safely be reset to zero. |
| void reset(uint32_t n, uint64_t &count) { |
| counter.store(n, cpp::MemoryOrder::RELAXED); |
| count = n; |
| } |
| |
| // Acquire n references in the counter if it has not been invalidated. |
| bool acquire(uint32_t n, uint64_t &count) { |
| count = counter.fetch_add(n, cpp::MemoryOrder::RELAXED) + n; |
| return (count & INVALID) == 0; |
| } |
| |
| // Release n references in the counter. This function should only be |
| // called following a successful acquire call. |
| bool release(uint32_t n) { |
| // If this thread caused the counter to reach zero we try to invalidate it |
| // and obtain exclusive rights to deconstruct it. If the CAS fails, either |
| // another thread resurrected the counter and we quit, or a parallel read |
| // helped us by invalidating it. For the latter, claim that flag and return. |
| if (counter.fetch_sub(n, cpp::MemoryOrder::RELAXED) == n) { |
| uint64_t expected = 0; |
| if (counter.compare_exchange_strong(expected, INVALID, |
| cpp::MemoryOrder::RELAXED, |
| cpp::MemoryOrder::RELAXED)) |
| return true; |
| else if ((expected & HELPED) && |
| (counter.exchange(INVALID, cpp::MemoryOrder::RELAXED) & |
| HELPED)) |
| return true; |
| } |
| return false; |
| } |
| |
| // Returns the current reference count, potentially helping a releasing |
| // thread. |
| uint64_t read() { |
| auto val = counter.load(cpp::MemoryOrder::RELAXED); |
| if (val == 0 && counter.compare_exchange_strong( |
| val, INVALID | HELPED, cpp::MemoryOrder::RELAXED)) |
| return 0; |
| return (val & INVALID) ? 0 : val; |
| } |
| |
| cpp::Atomic<uint64_t> counter{0}; |
| }; |
| |
| cpp::Atomic<Slab *> ptr{nullptr}; |
| RefCounter ref{}; |
| |
| // Should only be called by a single lane for each different pointer. |
| template <typename... Args> |
| Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) { |
| Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED); |
| if (!expected && |
| ptr.compare_exchange_strong( |
| expected, reinterpret_cast<Slab *>(SENTINEL), |
| cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) { |
| count = cpp::numeric_limits<uint64_t>::max(); |
| void *raw = impl::rpc_allocate(sizeof(Slab)); |
| if (!raw) |
| return nullptr; |
| return new (raw) Slab(cpp::forward<Args>(args)...); |
| } |
| |
| if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL)) |
| return nullptr; |
| |
| if (!ref.acquire(n, count)) |
| return nullptr; |
| |
| cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); |
| return ptr.load(cpp::MemoryOrder::RELAXED); |
| } |
| |
| // Finalize the associated memory and signal that it is ready to use by |
| // resetting the counter. |
| void finalize(Slab *mem, uint32_t n, uint64_t &count) { |
| cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); |
| ptr.store(mem, cpp::MemoryOrder::RELAXED); |
| cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); |
| if (!ref.acquire(n, count)) |
| ref.reset(n, count); |
| } |
| |
| public: |
| // Attempt to lock access to the pointer, potentially creating it if empty. |
| // The uniform mask represents which lanes share the same pointer. For each |
| // uniform value we elect a leader to handle it on behalf of the other lanes. |
| template <typename... Args> |
| Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count, |
| Args &&...args) { |
| count = 0; |
| Slab *result = nullptr; |
| if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform))) |
| result = try_lock_impl(cpp::popcount(uniform), count, |
| cpp::forward<Args>(args)...); |
| result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result); |
| count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count); |
| |
| if (!result) |
| return nullptr; |
| |
| // We defer storing the newly allocated slab until now so that we can use |
| // multiple lanes to initialize it and release it for use. |
| if (count == cpp::numeric_limits<uint64_t>::max()) { |
| result->initialize(uniform); |
| if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform))) |
| finalize(result, cpp::popcount(uniform), count); |
| } |
| |
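| // For illustration: if a group of popcount(uniform) = 3 lanes raised the |
| // counter from 5 to 8, the adjustment below hands the lanes the distinct |
| // counts 6, 7, and 8, as if each lane had acquired its reference one at a |
| // time. |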
| if (count != cpp::numeric_limits<uint64_t>::max()) |
| count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1; |
| |
| return result; |
| } |
| |
| // Release the associated lock on the pointer, potentially destroying it. |
| void unlock(uint64_t lane_mask, uint64_t mask) { |
| cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); |
| if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) && |
| ref.release(cpp::popcount(mask))) { |
| Slab *p = ptr.load(cpp::MemoryOrder::RELAXED); |
| p->~Slab(); |
| impl::rpc_free(p); |
| cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); |
| ptr.store(nullptr, cpp::MemoryOrder::RELAXED); |
| } |
| gpu::sync_lane(lane_mask); |
| } |
| |
| // Get the current value of the reference counter. |
| uint64_t use_count() { return ref.read(); } |
| }; |
| |
| // The global array used to search for a valid slab to allocate from. |
| static GuardPtr slots[ARRAY_SIZE] = {}; |
| |
| // Tries to find a slab in the table that can support the given chunk size. |
| static Slab *find_slab(uint32_t chunk_size) { |
| // We start at a hashed value to spread out different chunk sizes. |
| uint32_t start = impl::hash(chunk_size); |
| uint64_t lane_mask = gpu::get_lane_mask(); |
| uint64_t uniform = gpu::match_any(lane_mask, chunk_size); |
| |
| Slab *result = nullptr; |
| uint32_t nudge = 0; |
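| |
| // As an illustration of the scan below, a uniform group of four lanes probes |
| // indices start + 0 .. start + 3 on the first pass and then advances its |
| // offset by four, sweeping the table cooperatively from its hashed start. |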
| for (uint64_t mask = lane_mask; mask; |
| mask = gpu::ballot(lane_mask, !result), ++nudge) { |
| uint32_t index = cpp::numeric_limits<uint32_t>::max(); |
| for (uint32_t offset = nudge / MAX_TRIES; |
| gpu::ballot(lane_mask, index == cpp::numeric_limits<uint32_t>::max()); |
| offset += cpp::popcount(uniform & lane_mask)) { |
| uint32_t candidate = |
| (start + offset + impl::lane_count(uniform & lane_mask)) % ARRAY_SIZE; |
| uint64_t available = |
| gpu::ballot(lane_mask, slots[candidate].use_count() < |
| Slab::available_chunks(chunk_size)); |
| uint32_t new_index = gpu::shuffle( |
| lane_mask, cpp::countr_zero(available & uniform), candidate); |
| |
| // Each uniform group will use the first available slot it finds. |
| if ((index == cpp::numeric_limits<uint32_t>::max() && |
| (available & uniform))) |
| index = new_index; |
| |
| // Guarantees that this loop will eventually exit if there is no space. |
| if (offset >= ARRAY_SIZE) { |
| result = reinterpret_cast<Slab *>(SENTINEL); |
| index = 0; |
| } |
| } |
| |
| // Try to claim a lock on the slot we found. |
| if (!result) { |
| uint64_t reserved = 0; |
| Slab *slab = slots[index].try_lock(lane_mask & mask, uniform & mask, |
| reserved, chunk_size, index); |
| // If we find a slab with a matching chunk size then we store the result. |
| // Otherwise, we need to release the claimed lock and continue. In the case |
| // of out-of-memory we return a sentinel value. |
| if (slab && reserved <= Slab::available_chunks(chunk_size) && |
| slab->get_chunk_size() == chunk_size) { |
| result = slab; |
| } else if (slab && (reserved > Slab::available_chunks(chunk_size) || |
| slab->get_chunk_size() != chunk_size)) { |
| if (slab->get_chunk_size() != chunk_size) |
| start = index + 1; |
| slots[index].unlock(gpu::get_lane_mask(), |
| gpu::get_lane_mask() & uniform); |
| } else if (!slab && reserved == cpp::numeric_limits<uint64_t>::max()) { |
| result = reinterpret_cast<Slab *>(SENTINEL); |
| } else { |
| sleep_briefly(); |
| } |
| } |
| } |
| return result; |
| } |
| |
| // Release the lock associated with a given slab. |
| static void release_slab(Slab *slab) { |
| uint32_t index = slab->get_global_index(); |
| uint64_t lane_mask = gpu::get_lane_mask(); |
| uint64_t uniform = gpu::match_any(lane_mask, index); |
| slots[index].unlock(lane_mask, uniform); |
| } |
| |
| namespace gpu { |
| |
| void *allocate(uint64_t size) { |
| if (!size) |
| return nullptr; |
| |
| // Allocations of half a slab or more could fit at most one chunk per slab, |
| // so they are sent directly to system memory. |
| if (size >= SLAB_SIZE / 2) |
| return impl::rpc_allocate(impl::round_up<SLAB_SIZE>(size)); |
| |
| // Try to find a slab for the rounded up chunk size and allocate from it. |
| uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size)); |
| Slab *slab = find_slab(chunk_size); |
| if (!slab || slab == reinterpret_cast<Slab *>(SENTINEL)) |
| return nullptr; |
| |
| uint64_t lane_mask = gpu::get_lane_mask(); |
| uint64_t uniform = gpu::match_any(lane_mask, slab->get_global_index()); |
| void *ptr = slab->allocate(lane_mask, uniform); |
| return ptr; |
| } |
| |
| void deallocate(void *ptr) { |
| if (!ptr) |
| return; |
| |
| // All non-slab allocations will be aligned on a 2MiB boundary. |
| if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0) |
| return impl::rpc_free(ptr); |
| |
| // The original slab pointer is found by rounding the given pointer down to |
| // the nearest 2MiB boundary. |
| Slab *slab = reinterpret_cast<Slab *>( |
| (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT)); |
| slab->deallocate(ptr); |
| release_slab(slab); |
| } |
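| |
| // A minimal device-side usage sketch (assuming an RPC server is running on |
| // the host, as the libc loader utilities provide), using the usual |
| // LIBC_NAMESPACE spelling from outside this file: |
| // |
| // void *ptr = LIBC_NAMESPACE::gpu::allocate(100); // a 112-byte chunk |
| // if (ptr) { /* ... use up to 112 bytes ... */ } |
| // LIBC_NAMESPACE::gpu::deallocate(ptr); |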
| |
| } // namespace gpu |
| } // namespace LIBC_NAMESPACE_DECL |